summaryrefslogtreecommitdiff
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--fs/btrfs/volumes.c895
1 files changed, 627 insertions, 268 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1718e1a5c320..309a57b9fc85 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
+#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
@@ -32,17 +33,6 @@
#include "volumes.h"
#include "async-thread.h"
-struct map_lookup {
- u64 type;
- int io_align;
- int io_width;
- int stripe_len;
- int sector_size;
- int num_stripes;
- int sub_stripes;
- struct btrfs_bio_stripe stripes[];
-};
-
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
@@ -161,7 +151,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
struct bio *cur;
int again = 0;
unsigned long num_run;
- unsigned long num_sync_run;
unsigned long batch_run = 0;
unsigned long limit;
unsigned long last_waited = 0;
@@ -172,11 +161,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
- /* we want to make sure that every time we switch from the sync
- * list to the normal list, we unplug
- */
- num_sync_run = 0;
-
loop:
spin_lock(&device->io_lock);
@@ -222,15 +206,6 @@ loop_lock:
spin_unlock(&device->io_lock);
- /*
- * if we're doing the regular priority list, make sure we unplug
- * for any high prio bios we've sent down
- */
- if (pending_bios == &device->pending_bios && num_sync_run > 0) {
- num_sync_run = 0;
- blk_run_backing_dev(bdi, NULL);
- }
-
while (pending) {
rmb();
@@ -258,19 +233,11 @@ loop_lock:
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
- if (cur->bi_rw & REQ_SYNC)
- num_sync_run++;
-
submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
- if (need_resched()) {
- if (num_sync_run) {
- blk_run_backing_dev(bdi, NULL);
- num_sync_run = 0;
- }
+ if (need_resched())
cond_resched();
- }
/*
* we made progress, there is more work to do and the bdi
@@ -303,13 +270,8 @@ loop_lock:
* against it before looping
*/
last_waited = ioc->last_waited;
- if (need_resched()) {
- if (num_sync_run) {
- blk_run_backing_dev(bdi, NULL);
- num_sync_run = 0;
- }
+ if (need_resched())
cond_resched();
- }
continue;
}
spin_lock(&device->io_lock);
@@ -322,22 +284,6 @@ loop_lock:
}
}
- if (num_sync_run) {
- num_sync_run = 0;
- blk_run_backing_dev(bdi, NULL);
- }
- /*
- * IO has already been through a long path to get here. Checksumming,
- * async helper threads, perhaps compression. We've done a pretty
- * good job of collecting a batch of IO and should just unplug
- * the device right away.
- *
- * This will help anyone who is waiting on the IO, they might have
- * already unplugged, but managed to do so before the bio they
- * cared about found its way down here.
- */
- blk_run_backing_dev(bdi, NULL);
-
cond_resched();
if (again)
goto loop;
@@ -600,8 +546,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
- if (!bh)
+ if (!bh) {
+ ret = -EINVAL;
goto error_close;
+ }
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -703,7 +651,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error_close;
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -729,59 +677,167 @@ error:
return ret;
}
+/* helper to account the used device space in the range */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+ u64 end, u64 *length)
+{
+ struct btrfs_key key;
+ struct btrfs_root *root = device->dev_root;
+ struct btrfs_dev_extent *dev_extent;
+ struct btrfs_path *path;
+ u64 extent_end;
+ int ret;
+ int slot;
+ struct extent_buffer *l;
+
+ *length = 0;
+
+ if (start >= device->total_bytes)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 2;
+
+ key.objectid = device->devid;
+ key.offset = start;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto out;
+ }
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid < device->devid)
+ goto next;
+
+ if (key.objectid > device->devid)
+ break;
+
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
+
+ dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (key.offset <= start && extent_end > end) {
+ *length = end - start + 1;
+ break;
+ } else if (key.offset <= start && extent_end > start)
+ *length += extent_end - start;
+ else if (key.offset > start && extent_end <= end)
+ *length += extent_end - key.offset;
+ else if (key.offset > start && key.offset <= end) {
+ *length += end - key.offset + 1;
+ break;
+ } else if (key.offset > end)
+ break;
+
+next:
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans: transaction handler
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @start: store the start of the free space.
+ * @len: the size of the free space that we find, or the size of the max
+ * free space if we don't find suitable free space
+ *
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
+ *
+ * @start is used to store the start of the free space if we find it. But if
+ * we don't find suitable free space, it will be used to store the start
+ * position of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
*/
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *max_avail)
+ u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
- struct btrfs_dev_extent *dev_extent = NULL;
+ struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
- u64 hole_size = 0;
- u64 last_byte = 0;
- u64 search_start = 0;
+ u64 hole_size;
+ u64 max_hole_start;
+ u64 max_hole_size;
+ u64 extent_end;
+ u64 search_start;
u64 search_end = device->total_bytes;
int ret;
- int slot = 0;
- int start_found;
+ int slot;
struct extent_buffer *l;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = 2;
- start_found = 0;
-
/* FIXME use last free of some kind */
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
- search_start = max((u64)1024 * 1024, search_start);
+ search_start = 1024 * 1024;
- if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+ if (root->fs_info->alloc_start + num_bytes <= search_end)
search_start = max(root->fs_info->alloc_start, search_start);
+ max_hole_start = search_start;
+ max_hole_size = 0;
+
+ if (search_start >= search_end) {
+ ret = -ENOSPC;
+ goto error;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ path->reada = 2;
+
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+
ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
if (ret < 0)
- goto error;
+ goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
- goto error;
- if (ret > 0)
- start_found = 1;
+ goto out;
}
- l = path->nodes[0];
- btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
while (1) {
l = path->nodes[0];
slot = path->slots[0];
@@ -790,24 +846,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
if (ret == 0)
continue;
if (ret < 0)
- goto error;
-no_more_items:
- if (!start_found) {
- if (search_start >= search_end) {
- ret = -ENOSPC;
- goto error;
- }
- *start = search_start;
- start_found = 1;
- goto check_pending;
- }
- *start = last_byte > search_start ?
- last_byte : search_start;
- if (search_end <= *start) {
- ret = -ENOSPC;
- goto error;
- }
- goto check_pending;
+ goto out;
+
+ break;
}
btrfs_item_key_to_cpu(l, &key, slot);
@@ -815,48 +856,62 @@ no_more_items:
goto next;
if (key.objectid > device->devid)
- goto no_more_items;
+ break;
- if (key.offset >= search_start && key.offset > last_byte &&
- start_found) {
- if (last_byte < search_start)
- last_byte = search_start;
- hole_size = key.offset - last_byte;
+ if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+ goto next;
- if (hole_size > *max_avail)
- *max_avail = hole_size;
+ if (key.offset > search_start) {
+ hole_size = key.offset - search_start;
- if (key.offset > last_byte &&
- hole_size >= num_bytes) {
- *start = last_byte;
- goto check_pending;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
+
+ /*
+ * If this free space is at least as large as what we need,
+ * it must be the max free space that we have found
+ * until now, so max_hole_start must point to the start
+ * of this free space and the length of this free space
+ * is stored in max_hole_size. Thus, we return
+ * max_hole_start and max_hole_size and go back to the
+ * caller.
+ */
+ if (hole_size >= num_bytes) {
+ ret = 0;
+ goto out;
}
}
- if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
- goto next;
- start_found = 1;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
- last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+ extent_end = key.offset + btrfs_dev_extent_length(l,
+ dev_extent);
+ if (extent_end > search_start)
+ search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
-check_pending:
- /* we have to make sure we didn't find an extent that has already
- * been allocated by the map tree or the original allocation
- */
- BUG_ON(*start < search_start);
- if (*start + num_bytes > search_end) {
- ret = -ENOSPC;
- goto error;
+ hole_size = search_end- search_start;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
}
- /* check for pending inserts here */
- ret = 0;
-error:
+ /* See above. */
+ if (hole_size < num_bytes)
+ ret = -ENOSPC;
+ else
+ ret = 0;
+
+out:
btrfs_free_path(path);
+error:
+ *start = max_hole_start;
+ if (len)
+ *len = max_hole_size;
return ret;
}
@@ -1103,6 +1158,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1196,7 +1255,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
set_blocksize(bdev, 4096);
bh = btrfs_read_dev_super(bdev);
if (!bh) {
- ret = -EIO;
+ ret = -EINVAL;
goto error_close;
}
disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1224,11 +1283,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
ret = btrfs_shrink_device(device, 0);
if (ret)
- goto error_brelse;
+ goto error_undo;
ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
if (ret)
- goto error_brelse;
+ goto error_undo;
device->in_fs_metadata = 0;
@@ -1302,6 +1361,13 @@ out:
mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
return ret;
+error_undo:
+ if (device->writeable) {
+ list_add(&device->dev_alloc_list,
+ &root->fs_info->fs_devices->alloc_list);
+ root->fs_info->fs_devices->rw_devices++;
+ }
+ goto error_brelse;
}
/*
@@ -1491,11 +1557,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = find_next_devid(root, &device->devid);
if (ret) {
+ kfree(device->name);
kfree(device);
goto error;
}
trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ kfree(device->name);
+ kfree(device);
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
lock_chunks(root);
device->writeable = 1;
@@ -1511,7 +1585,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
- device->mode = 0;
+ device->mode = FMODE_EXCL;
set_blocksize(device->bdev, 4096);
if (seeding_dev) {
@@ -1763,7 +1837,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
return ret;
trans = btrfs_start_transaction(root, 0);
- BUG_ON(!trans);
+ BUG_ON(IS_ERR(trans));
lock_chunks(root);
@@ -1794,6 +1868,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
BUG_ON(ret);
+ trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
BUG_ON(ret);
@@ -1916,6 +1992,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
mutex_lock(&dev_root->fs_info->volume_mutex);
dev_root = dev_root->fs_info->dev_root;
@@ -1934,7 +2013,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
BUG_ON(ret);
trans = btrfs_start_transaction(dev_root, 0);
- BUG_ON(!trans);
+ BUG_ON(IS_ERR(trans));
ret = btrfs_grow_device(trans, device, old_size);
BUG_ON(ret);
@@ -2100,6 +2179,11 @@ again:
/* Shrinking succeeded, else we would be at "done". */
trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto done;
+ }
+
lock_chunks(root);
device->disk_total_bytes = new_size;
@@ -2154,66 +2238,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
return calc_size * num_stripes;
}
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes, u64 *stripe_size,
- u64 start, u64 type)
+/* Used to sort the devices by max_avail (descending order) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct list_head *cur;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct list_head private_devs;
- int min_stripe_size = 1 * 1024 * 1024;
- u64 calc_size = 1024 * 1024 * 1024;
- u64 max_chunk_size = calc_size;
- u64 min_free;
- u64 avail;
- u64 max_avail = 0;
- u64 dev_offset;
- int num_stripes = 1;
- int min_stripes = 1;
- int sub_stripes = 0;
- int looped = 0;
- int ret;
- int index;
- int stripe_len = 64 * 1024;
+ if (((struct btrfs_device_info *)dev_info1)->max_avail >
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return -1;
+ else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+ ((struct btrfs_device_info *)dev_info2)->max_avail)
+ return 1;
+ else
+ return 0;
+}
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
- }
- if (list_empty(&fs_devices->alloc_list))
- return -ENOSPC;
+static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+ int *num_stripes, int *min_stripes,
+ int *sub_stripes)
+{
+ *num_stripes = 1;
+ *min_stripes = 1;
+ *sub_stripes = 0;
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- num_stripes = fs_devices->rw_devices;
- min_stripes = 2;
+ *num_stripes = fs_devices->rw_devices;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_DUP)) {
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
if (fs_devices->rw_devices < 2)
return -ENOSPC;
- num_stripes = 2;
- min_stripes = 2;
+ *num_stripes = 2;
+ *min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes = fs_devices->rw_devices;
- if (num_stripes < 4)
+ *num_stripes = fs_devices->rw_devices;
+ if (*num_stripes < 4)
return -ENOSPC;
- num_stripes &= ~(u32)1;
- sub_stripes = 2;
- min_stripes = 4;
+ *num_stripes &= ~(u32)1;
+ *sub_stripes = 2;
+ *min_stripes = 4;
}
+ return 0;
+}
+
+static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+ u64 proposed_size, u64 type,
+ int num_stripes, int small_stripe)
+{
+ int min_stripe_size = 1 * 1024 * 1024;
+ u64 calc_size = proposed_size;
+ u64 max_chunk_size = calc_size;
+ int ncopies = 1;
+
+ if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID10))
+ ncopies = 2;
+
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_chunk_size = 10 * calc_size;
min_stripe_size = 64 * 1024 * 1024;
@@ -2230,51 +2315,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
-again:
- max_avail = 0;
- if (!map || map->num_stripes != num_stripes) {
- kfree(map);
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map)
- return -ENOMEM;
- map->num_stripes = num_stripes;
- }
-
- if (calc_size * num_stripes > max_chunk_size) {
- calc_size = max_chunk_size;
+ if (calc_size * num_stripes > max_chunk_size * ncopies) {
+ calc_size = max_chunk_size * ncopies;
do_div(calc_size, num_stripes);
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
}
/* we don't want tiny stripes */
- if (!looped)
+ if (!small_stripe)
calc_size = max_t(u64, min_stripe_size, calc_size);
/*
- * we're about to do_div by the stripe_len so lets make sure
+ * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
* we end up with something bigger than a stripe
*/
- calc_size = max_t(u64, calc_size, stripe_len * 4);
+ calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+
+ do_div(calc_size, BTRFS_STRIPE_LEN);
+ calc_size *= BTRFS_STRIPE_LEN;
+
+ return calc_size;
+}
+
+static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
+ int num_stripes)
+{
+ struct map_lookup *new;
+ size_t len = map_lookup_size(num_stripes);
+
+ BUG_ON(map->num_stripes < num_stripes);
+
+ if (map->num_stripes == num_stripes)
+ return map;
+
+ new = kmalloc(len, GFP_NOFS);
+ if (!new) {
+ /* just change map->num_stripes */
+ map->num_stripes = num_stripes;
+ return map;
+ }
+
+ memcpy(new, map, len);
+ new->num_stripes = num_stripes;
+ kfree(map);
+ return new;
+}
+
+/*
+ * helper to allocate device space from btrfs_device_info, in which we stored
+ * max free space information of every device. It is used when we can not
+ * allocate chunks by default size.
+ *
+ * With this helper, we can allocate a new chunk as large as possible.
+ */
+static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device_info *devices,
+ int nr_device, u64 type,
+ struct map_lookup **map_lookup,
+ int min_stripes, u64 *stripe_size)
+{
+ int i, index, sort_again = 0;
+ int min_devices = min_stripes;
+ u64 max_avail, min_free;
+ struct map_lookup *map = *map_lookup;
+ int ret;
+
+ if (nr_device < min_stripes)
+ return -ENOSPC;
+
+ btrfs_descending_sort_devices(devices, nr_device);
+
+ max_avail = devices[0].max_avail;
+ if (!max_avail)
+ return -ENOSPC;
+
+ for (i = 0; i < nr_device; i++) {
+ /*
+ * if dev_offset = 0, it means the free space of this device
+ * is less than what we need, and we didn't search max avail
+ * extent on this device, so do it now.
+ */
+ if (!devices[i].dev_offset) {
+ ret = find_free_dev_extent(trans, devices[i].dev,
+ max_avail,
+ &devices[i].dev_offset,
+ &devices[i].max_avail);
+ if (ret != 0 && ret != -ENOSPC)
+ return ret;
+ sort_again = 1;
+ }
+ }
+
+ /* we updated the max avail free extent of some devices, so sort again */
+ if (sort_again)
+ btrfs_descending_sort_devices(devices, nr_device);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_devices = 1;
+
+ if (!devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ max_avail = devices[min_devices - 1].max_avail;
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ do_div(max_avail, 2);
+
+ max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+ min_stripes, 1);
+ if (type & BTRFS_BLOCK_GROUP_DUP)
+ min_free = max_avail * 2;
+ else
+ min_free = max_avail;
+
+ if (min_free > devices[min_devices - 1].max_avail)
+ return -ENOSPC;
+
+ map = __shrink_map_lookup_stripes(map, min_stripes);
+ *stripe_size = max_avail;
+
+ index = 0;
+ for (i = 0; i < min_stripes; i++) {
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset;
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
+ i++;
+ map->stripes[i].dev = devices[index].dev;
+ map->stripes[i].physical = devices[index].dev_offset +
+ max_avail;
+ }
+ index++;
+ }
+ *map_lookup = map;
+
+ return 0;
+}
+
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup **map_ret,
+ u64 *num_bytes, u64 *stripe_size,
+ u64 start, u64 type)
+{
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_device *device = NULL;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_device_info *devices_info;
+ struct list_head private_devs;
+ u64 calc_size = 1024 * 1024 * 1024;
+ u64 min_free;
+ u64 avail;
+ u64 dev_offset;
+ int num_stripes;
+ int min_stripes;
+ int sub_stripes;
+ int min_devices; /* the min number of devices we need */
+ int i;
+ int ret;
+ int index;
- do_div(calc_size, stripe_len);
- calc_size *= stripe_len;
+ if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+ (type & BTRFS_BLOCK_GROUP_DUP)) {
+ WARN_ON(1);
+ type &= ~BTRFS_BLOCK_GROUP_DUP;
+ }
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
+
+ ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+ &min_stripes, &sub_stripes);
+ if (ret)
+ return ret;
+
+ devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
+
+ map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+ if (!map) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ map->num_stripes = num_stripes;
cur = fs_devices->alloc_list.next;
index = 0;
+ i = 0;
- if (type & BTRFS_BLOCK_GROUP_DUP)
+ calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+ num_stripes, 0);
+
+ if (type & BTRFS_BLOCK_GROUP_DUP) {
min_free = calc_size * 2;
- else
+ min_devices = 1;
+ } else {
min_free = calc_size;
-
- /*
- * we add 1MB because we never use the first 1MB of the device, unless
- * we've looped, then we are likely allocating the maximum amount of
- * space left already
- */
- if (!looped)
- min_free += 1024 * 1024;
+ min_devices = min_stripes;
+ }
INIT_LIST_HEAD(&private_devs);
while (index < num_stripes) {
@@ -2287,27 +2530,39 @@ again:
cur = cur->next;
if (device->in_fs_metadata && avail >= min_free) {
- ret = find_free_dev_extent(trans, device,
- min_free, &dev_offset,
- &max_avail);
+ ret = find_free_dev_extent(trans, device, min_free,
+ &devices_info[i].dev_offset,
+ &devices_info[i].max_avail);
if (ret == 0) {
list_move_tail(&device->dev_alloc_list,
&private_devs);
map->stripes[index].dev = device;
- map->stripes[index].physical = dev_offset;
+ map->stripes[index].physical =
+ devices_info[i].dev_offset;
index++;
if (type & BTRFS_BLOCK_GROUP_DUP) {
map->stripes[index].dev = device;
map->stripes[index].physical =
- dev_offset + calc_size;
+ devices_info[i].dev_offset +
+ calc_size;
index++;
}
- }
- } else if (device->in_fs_metadata && avail > max_avail)
- max_avail = avail;
+ } else if (ret != -ENOSPC)
+ goto error;
+
+ devices_info[i].dev = device;
+ i++;
+ } else if (device->in_fs_metadata &&
+ avail >= BTRFS_STRIPE_LEN) {
+ devices_info[i].dev = device;
+ devices_info[i].max_avail = avail;
+ i++;
+ }
+
if (cur == &fs_devices->alloc_list)
break;
}
+
list_splice(&private_devs, &fs_devices->alloc_list);
if (index < num_stripes) {
if (index >= min_stripes) {
@@ -2316,34 +2571,38 @@ again:
num_stripes /= sub_stripes;
num_stripes *= sub_stripes;
}
- looped = 1;
- goto again;
- }
- if (!looped && max_avail > 0) {
- looped = 1;
- calc_size = max_avail;
- goto again;
+
+ map = __shrink_map_lookup_stripes(map, num_stripes);
+ } else if (i >= min_devices) {
+ ret = __btrfs_alloc_tiny_space(trans, fs_devices,
+ devices_info, i, type,
+ &map, min_stripes,
+ &calc_size);
+ if (ret)
+ goto error;
+ } else {
+ ret = -ENOSPC;
+ goto error;
}
- kfree(map);
- return -ENOSPC;
}
map->sector_size = extent_root->sectorsize;
- map->stripe_len = stripe_len;
- map->io_align = stripe_len;
- map->io_width = stripe_len;
+ map->stripe_len = BTRFS_STRIPE_LEN;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
map->type = type;
- map->num_stripes = num_stripes;
map->sub_stripes = sub_stripes;
*map_ret = map;
*stripe_size = calc_size;
*num_bytes = chunk_bytes_by_type(type, calc_size,
- num_stripes, sub_stripes);
+ map->num_stripes, sub_stripes);
+
+ trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
- kfree(map);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
em->bdev = (struct block_device *)map;
em->start = start;
@@ -2376,7 +2635,13 @@ again:
index++;
}
+ kfree(devices_info);
return 0;
+
+error:
+ kfree(map);
+ kfree(devices_info);
+ return ret;
}
static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -2442,6 +2707,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
item_size);
BUG_ON(ret);
}
+
kfree(chunk);
return 0;
}
@@ -2639,14 +2905,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
struct btrfs_multi_bio **multi_ret,
- int mirror_num, struct page *unplug_page)
+ int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
+ u64 stripe_end_offset;
u64 stripe_nr;
+ u64 stripe_nr_orig;
+ u64 stripe_nr_end;
int stripes_allocated = 8;
int stripes_required = 1;
int stripe_index;
@@ -2655,7 +2924,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
int max_errors = 0;
struct btrfs_multi_bio *multi = NULL;
- if (multi_ret && !(rw & REQ_WRITE))
+ if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
stripes_allocated = 1;
again:
if (multi_ret) {
@@ -2671,11 +2940,6 @@ again:
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
- if (!em && unplug_page) {
- kfree(multi);
- return 0;
- }
-
if (!em) {
printk(KERN_CRIT "unable to find logical %llu len %llu\n",
(unsigned long long)logical,
@@ -2701,7 +2965,15 @@ again:
max_errors = 1;
}
}
- if (multi_ret && (rw & REQ_WRITE) &&
+ if (rw & REQ_DISCARD) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ stripes_required = map->num_stripes;
+ }
+ }
+ if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes;
free_extent_map(em);
@@ -2721,23 +2993,37 @@ again:
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ if (rw & REQ_DISCARD)
+ *length = min_t(u64, em->len - offset, *length);
+ else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP)) {
/* we limit the length of each bio to what fits in a stripe */
*length = min_t(u64, em->len - offset,
- map->stripe_len - stripe_offset);
+ map->stripe_len - stripe_offset);
} else {
*length = em->len - offset;
}
- if (!multi_ret && !unplug_page)
+ if (!multi_ret)
goto out;
num_stripes = 1;
stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (unplug_page || (rw & REQ_WRITE))
+ stripe_nr_orig = stripe_nr;
+ stripe_nr_end = (offset + *length + map->stripe_len - 1) &
+ (~(map->stripe_len - 1));
+ do_div(stripe_nr_end, map->stripe_len);
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + *length);
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->num_stripes,
+ stripe_nr_end - stripe_nr_orig);
+ stripe_index = do_div(stripe_nr, map->num_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ if (rw & (REQ_WRITE | REQ_DISCARD))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -2748,7 +3034,7 @@ again:
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (rw & REQ_WRITE)
+ if (rw & (REQ_WRITE | REQ_DISCARD))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -2759,8 +3045,12 @@ again:
stripe_index = do_div(stripe_nr, factor);
stripe_index *= map->sub_stripes;
- if (unplug_page || (rw & REQ_WRITE))
+ if (rw & REQ_WRITE)
num_stripes = map->sub_stripes;
+ else if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->sub_stripes *
+ (stripe_nr_end - stripe_nr_orig),
+ map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
@@ -2778,24 +3068,101 @@ again:
}
BUG_ON(stripe_index >= map->num_stripes);
- for (i = 0; i < num_stripes; i++) {
- if (unplug_page) {
- struct btrfs_device *device;
- struct backing_dev_info *bdi;
-
- device = map->stripes[stripe_index].dev;
- if (device->bdev) {
- bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi->unplug_io_fn)
- bdi->unplug_io_fn(bdi, unplug_page);
- }
- } else {
+ if (rw & REQ_DISCARD) {
+ for (i = 0; i < num_stripes; i++) {
multi->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
multi->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ u64 stripes;
+ u32 last_stripe = 0;
+ int j;
+
+ div_u64_rem(stripe_nr_end - 1,
+ map->num_stripes,
+ &last_stripe);
+
+ for (j = 0; j < map->num_stripes; j++) {
+ u32 test;
+
+ div_u64_rem(stripe_nr_end - 1 - j,
+ map->num_stripes, &test);
+ if (test == stripe_index)
+ break;
+ }
+ stripes = stripe_nr_end - 1 - j;
+ do_div(stripes, map->num_stripes);
+ multi->stripes[i].length = map->stripe_len *
+ (stripes - stripe_nr + 1);
+
+ if (i == 0) {
+ multi->stripes[i].length -=
+ stripe_offset;
+ stripe_offset = 0;
+ }
+ if (stripe_index == last_stripe)
+ multi->stripes[i].length -=
+ stripe_end_offset;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ u64 stripes;
+ int j;
+ int factor = map->num_stripes /
+ map->sub_stripes;
+ u32 last_stripe = 0;
+
+ div_u64_rem(stripe_nr_end - 1,
+ factor, &last_stripe);
+ last_stripe *= map->sub_stripes;
+
+ for (j = 0; j < factor; j++) {
+ u32 test;
+
+ div_u64_rem(stripe_nr_end - 1 - j,
+ factor, &test);
+
+ if (test ==
+ stripe_index / map->sub_stripes)
+ break;
+ }
+ stripes = stripe_nr_end - 1 - j;
+ do_div(stripes, factor);
+ multi->stripes[i].length = map->stripe_len *
+ (stripes - stripe_nr + 1);
+
+ if (i < map->sub_stripes) {
+ multi->stripes[i].length -=
+ stripe_offset;
+ if (i == map->sub_stripes - 1)
+ stripe_offset = 0;
+ }
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ map->sub_stripes - 1)) {
+ multi->stripes[i].length -=
+ stripe_end_offset;
+ }
+ } else
+ multi->stripes[i].length = *length;
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ /* This could only happen for RAID0/10 */
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+ } else {
+ for (i = 0; i < num_stripes; i++) {
+ multi->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ multi->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
}
- stripe_index++;
}
if (multi_ret) {
*multi_ret = multi;
@@ -2812,7 +3179,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
struct btrfs_multi_bio **multi_ret, int mirror_num)
{
return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
- mirror_num, NULL);
+ mirror_num);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -2880,14 +3247,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
- u64 logical, struct page *page)
-{
- u64 length = PAGE_CACHE_SIZE;
- return __btrfs_map_block(map_tree, READ, logical, &length,
- NULL, 0, page);
-}
-
static void end_bio_multi_stripe(struct bio *bio, int err)
{
struct btrfs_multi_bio *multi = bio->bi_private;