From d5e2003c2bcda93a8f2e668eb4642d70c9c38301 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 14:52:27 +0000 Subject: Btrfs: detect wether a device supports discard We have a problem where if a user specifies discard but doesn't actually support it we will return EOPNOTSUPP from btrfs_discard_extent. This is a problem because this gets called (in a fashion) from the tree log recovery code, which has a nice little BUG_ON(ret) after it, which causes us to fail the tree log replay. So instead detect wether our devices support discard when we're adding them and then don't issue discards if we know that the device doesn't support it. And just for good measure set ret = 0 in btrfs_issue_discard just in case we still get EOPNOTSUPP so we don't screw anybody up like this again. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 66bac226944e..059dfa048cc0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1782,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < multi->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, stripe->length); @@ -1789,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) break; + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. + */ + ret = 0; } kfree(multi); } - if (discarded_bytes && ret == -EOPNOTSUPP) - ret = 0; if (actual_bytes) *actual_bytes = discarded_bytes; -- cgit v1.2.3 From cdcb725c05fe0cb71777c66ddc2445fedbbb3c59 Mon Sep 17 00:00:00 2001 From: liubo Date: Wed, 3 Aug 2011 10:15:25 +0000 Subject: Btrfs: check if there is enough space for balancing smarter When checking if there is enough space for balancing a block group, since we do not take raid types into consideration, we do not account corrent amounts of space that we needed. This makes us do some extra work before we get ENOSPC. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 059dfa048cc0..a3e71b59f66e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6728,6 +6728,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_space_info *space_info; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; + u64 min_free; + int index; + int dev_nr = 0; + int dev_min = 1; int full = 0; int ret = 0; @@ -6737,8 +6741,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) if (!block_group) return -1; + min_free = btrfs_block_group_used(&block_group->item); + /* no bytes used, we're good */ - if (!btrfs_block_group_used(&block_group->item)) + if (!min_free) goto out; space_info = block_group->space_info; @@ -6754,10 +6760,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * all of the extents from this block group. If we can, we're good */ if ((space_info->total_bytes != block_group->key.offset) && - (space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - btrfs_block_group_used(&block_group->item) < - space_info->total_bytes)) { + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + min_free < space_info->total_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -6774,9 +6779,29 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) if (full) goto out; + /* + * index: + * 0: raid10 + * 1: raid1 + * 2: dup + * 3: raid0 + * 4: single + */ + index = get_block_group_index(block_group); + if (index == 0) { + dev_min = 4; + min_free /= 2; + } else if (index == 1) { + dev_min = 2; + } else if (index == 2) { + min_free *= 2; + } else if (index == 3) { + dev_min = fs_devices->rw_devices; + min_free /= dev_min; + } + mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - u64 min_free = btrfs_block_group_used(&block_group->item); u64 dev_offset; /* @@ -6787,7 +6812,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = find_free_dev_extent(NULL, device, min_free, &dev_offset, NULL); if (!ret) + dev_nr++; + + if (dev_nr >= dev_min) break; + ret = -1; } } -- cgit v1.2.3 From cb1b69f4508a1e8c1a7907379eafceb7ae0325ef Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 9 Aug 2011 07:11:13 +0000 Subject: Btrfs: forced readonly when btrfs_drop_snapshot() fails The filesystem turns readonly instead of returning the error to the caller when detected error in btrfs_drop_snapshot(). and, because the caller doesn't check the error, the function type is changed to 'void'. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/extent-tree.c | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a6263bdab818..884293642a6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2367,8 +2367,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref); +void btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a3e71b59f66e..80d6148f60ac 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6277,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ -int btrfs_drop_snapshot(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int update_ref) +void btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -6291,13 +6291,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + err = -ENOMEM; + goto out; + } wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { btrfs_free_path(path); - return -ENOMEM; + err = -ENOMEM; + goto out; } trans = btrfs_start_transaction(tree_root, 0); @@ -6326,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, path->lowest_level = 0; if (ret < 0) { err = ret; - goto out; + goto out_free; } WARN_ON(ret > 0); @@ -6433,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); kfree(root); } -out: +out_free: btrfs_end_transaction_throttle(trans, tree_root); kfree(wc); btrfs_free_path(path); - return err; +out: + if (err) + btrfs_std_error(root->fs_info, err); + return; } /* -- cgit v1.2.3 From 6719db6a23d4b7f1e5052eedae394135e3aef9c1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 20 Aug 2011 08:29:51 -0400 Subject: Btrfs: fix 64 bit divide problem This fixes a regression introduced by commit cdcb725c05fe ("Btrfs: check if there is enough space for balancing smarter"). We can't do 64-bit divides on 32-bit architectures. In cases where we need to divide/multiply by 2 we should just left/right shift respectively, and in cases where theres N number of devices use do_div. Also make the counters u64 to match up with rw_devices. Thanks, Signed-off-by: Josef Bacik Acked-and-tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- fs/btrfs/extent-tree.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 80d6148f60ac..f5be06a2462f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6735,9 +6735,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; u64 min_free; + u64 dev_min = 1; + u64 dev_nr = 0; int index; - int dev_nr = 0; - int dev_min = 1; int full = 0; int ret = 0; @@ -6796,14 +6796,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) index = get_block_group_index(block_group); if (index == 0) { dev_min = 4; - min_free /= 2; + /* Divide by 2 */ + min_free >>= 1; } else if (index == 1) { dev_min = 2; } else if (index == 2) { - min_free *= 2; + /* Multiply by 2 */ + min_free <<= 1; } else if (index == 3) { dev_min = fs_devices->rw_devices; - min_free /= dev_min; + do_div(min_free, dev_min); } mutex_lock(&root->fs_info->chunk_mutex); -- cgit v1.2.3 From a1d3c4786a4b9c71c0767aa656a759968f7554b6 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 4 Aug 2011 17:15:33 +0200 Subject: btrfs: btrfs_multi_bio replaced with btrfs_bio btrfs_bio is a bio abstraction able to split and not complete after the last bio has returned (like the old btrfs_multi_bio). Additionally, btrfs_bio tracks the mirror_num used to read data which can be used for error correction purposes. Signed-off-by: Jan Schmidt --- fs/btrfs/extent-tree.c | 10 ++-- fs/btrfs/scrub.c | 20 ++++---- fs/btrfs/volumes.c | 128 ++++++++++++++++++++++++++----------------------- fs/btrfs/volumes.h | 10 ++-- 4 files changed, 90 insertions(+), 78 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..119f842c1d4f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1770,18 +1770,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, { int ret; u64 discarded_bytes = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &multi, 0); + bytenr, &num_bytes, &bbio, 0); if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < multi->num_stripes; i++, stripe++) { + for (i = 0; i < bbio->num_stripes; i++, stripe++) { if (!stripe->dev->can_discard) continue; @@ -1800,7 +1800,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(multi); + kfree(bbio); } if (actual_bytes) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index db09f01c0e4f..97142a218f0a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -572,7 +572,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; @@ -610,8 +610,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) length = PAGE_SIZE; ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &multi, 0); - if (ret || !multi || length < PAGE_SIZE) { + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { printk(KERN_ERR "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); @@ -619,19 +619,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; } - if (multi->num_stripes == 1) + if (bbio->num_stripes == 1) /* there aren't any replicas */ goto uncorrectable; /* * first find a good copy */ - for (i = 0; i < multi->num_stripes; ++i) { + for (i = 0; i < bbio->num_stripes; ++i) { if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, - multi->stripes[i].physical >> 9, + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ continue; @@ -640,7 +640,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) break; } - if (i == multi->num_stripes) + if (i == bbio->num_stripes) goto uncorrectable; if (!sdev->readonly) { @@ -655,7 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) } } - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); @@ -665,7 +665,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) return; uncorrectable: - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..9db4d7962f9b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2848,7 +2848,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, + struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; @@ -2866,18 +2866,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int i; int num_stripes; int max_errors = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; - if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: - if (multi_ret) { - multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), GFP_NOFS); - if (!multi) + if (!bbio) return -ENOMEM; - atomic_set(&multi->error, 0); + atomic_set(&bbio->error, 0); } read_lock(&em_tree->lock); @@ -2898,7 +2898,7 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our multi bio struct is too small, back off and try again */ + /* if our btrfs_bio struct is too small, back off and try again */ if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { @@ -2917,11 +2917,11 @@ again: stripes_required = map->num_stripes; } } - if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); - kfree(multi); + kfree(bbio); goto again; } stripe_nr = offset; @@ -2950,7 +2950,7 @@ again: *length = em->len - offset; } - if (!multi_ret) + if (!bbio_ret) goto out; num_stripes = 1; @@ -2975,13 +2975,17 @@ again: stripe_index = find_live_mirror(map, 0, map->num_stripes, current->pid % map->num_stripes); + mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD)) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; + } else { + mirror_num = 1; + } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; @@ -3001,6 +3005,7 @@ again: stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); + mirror_num = stripe_index + 1; } } else { /* @@ -3009,15 +3014,16 @@ again: * stripe_index is the number of our device in the stripe array */ stripe_index = do_div(stripe_nr, map->num_stripes); + mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { u64 stripes; @@ -3038,16 +3044,16 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, map->num_stripes); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i == 0) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; stripe_offset = 0; } if (stripe_index == last_stripe) - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u64 stripes; @@ -3072,11 +3078,11 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, factor); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i < map->sub_stripes) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; if (i == map->sub_stripes - 1) stripe_offset = 0; @@ -3084,11 +3090,11 @@ again: if (stripe_index >= last_stripe && stripe_index <= (last_stripe + map->sub_stripes - 1)) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } } else - multi->stripes[i].length = *length; + bbio->stripes[i].length = *length; stripe_index++; if (stripe_index == map->num_stripes) { @@ -3099,19 +3105,20 @@ again: } } else { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = + bbio->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } } - if (multi_ret) { - *multi_ret = multi; - multi->num_stripes = num_stripes; - multi->max_errors = max_errors; + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } out: free_extent_map(em); @@ -3120,9 +3127,9 @@ out: int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, mirror_num); } @@ -3191,28 +3198,28 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_multi_bio *multi = bio->bi_private; + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) - atomic_inc(&multi->error); + atomic_inc(&bbio->error); - if (bio == multi->orig_bio) + if (bio == bbio->orig_bio) is_orig_bio = 1; - if (atomic_dec_and_test(&multi->stripes_pending)) { + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = multi->orig_bio; + bio = bbio->orig_bio; } - bio->bi_private = multi->private; - bio->bi_end_io = multi->end_io; + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ - if (atomic_read(&multi->error) > multi->max_errors) { + if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; } else if (err) { /* @@ -3222,7 +3229,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(multi); + kfree(bbio); bio_endio(bio, err); } else if (!is_orig_bio) { @@ -3302,20 +3309,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct btrfs_multi_bio *multi = NULL; int ret; int dev_nr = 0; int total_devs = 1; + struct btrfs_bio *bbio = NULL; length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); BUG_ON(ret); - total_devs = multi->num_stripes; + total_devs = bbio->num_stripes; if (map_length < length) { printk(KERN_CRIT "mapping failed logical %llu bio len %llu " "len %llu\n", (unsigned long long)logical, @@ -3323,25 +3330,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, (unsigned long long)map_length); BUG(); } - multi->end_io = first_bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->orig_bio = first_bio; - atomic_set(&multi->stripes_pending, multi->num_stripes); + + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { - if (total_devs > 1) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); - } else { - bio = first_bio; - } - bio->bi_private = multi; - bio->bi_end_io = end_bio_multi_stripe; + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; } - bio->bi_sector = multi->stripes[dev_nr].physical >> 9; - dev = multi->stripes[dev_nr].dev; + bio->bi_private = bbio; + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; + dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + dev->name, dev->devid, bio->bi_size); bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -3354,8 +3364,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } dev_nr++; } - if (total_devs == 1) - kfree(multi); return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e177..71f4f3f67495 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -136,7 +136,10 @@ struct btrfs_bio_stripe { u64 length; /* only used for discard mappings */ }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +struct btrfs_bio { atomic_t stripes_pending; bio_end_io_t *end_io; struct bio *orig_bio; @@ -144,6 +147,7 @@ struct btrfs_multi_bio { atomic_t error; int max_errors; int num_stripes; + int mirror_num; struct btrfs_bio_stripe stripes[]; }; @@ -171,7 +175,7 @@ struct map_lookup { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -180,7 +184,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); -- cgit v1.2.3 From 0cbbdf7c9c46467bfb7129c30236f36a679ab244 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jul 2011 16:02:04 -0400 Subject: Btrfs: kill reserved_bytes in inode reserved_bytes is not used for anything in the inode, remove it. Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 5 ----- fs/btrfs/extent-tree.c | 2 -- fs/btrfs/inode.c | 1 - 3 files changed, 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bf325f40cf92..c70fb10a307b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -103,11 +103,6 @@ struct btrfs_inode { */ u64 delalloc_bytes; - /* total number of bytes that may be used for this inode for - * delalloc - */ - u64 reserved_bytes; - /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..03edac4f7771 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3122,7 +3122,6 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); return 0; @@ -3144,7 +3143,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; spin_unlock(&data_sinfo->lock); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b2d004ad66a0..156c3a0da792 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6755,7 +6755,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; - ei->reserved_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->index_cnt = (u64)-1; -- cgit v1.2.3 From fb25e9141ab843794d5cdef3936ccb58435e2371 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Jul 2011 17:00:46 -0400 Subject: Btrfs: use bytes_may_use for all ENOSPC reservations We have been using bytes_reserved for metadata reservations, which is wrong since we use that to keep track of outstanding reservations from the allocator. This resulted in us doing a lot of silly things to make sure we don't allocate a bunch of metadata chunks since we never had a real view of how much space was actually in use by metadata. This passes Arne's enospc test and xfstests as well as my own enospc tests. Hopefully this will get us moving in the right direction. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 - fs/btrfs/extent-tree.c | 163 ++++++++++++++++++++++++-------------------- fs/btrfs/free-space-cache.c | 29 ++++++-- 3 files changed, 112 insertions(+), 82 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f49..332cbdc86ad4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2196,8 +2196,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 03edac4f7771..fbe6278f466b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -52,6 +52,21 @@ enum { CHUNK_ALLOC_LIMITED = 2, }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. + * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +96,8 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -3128,9 +3145,7 @@ commit_trans: } /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { @@ -3163,6 +3178,7 @@ static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, u64 alloc_bytes, int force) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; @@ -3170,6 +3186,13 @@ static int should_alloc_chunk(struct btrfs_root *root, if (force == CHUNK_ALLOC_FORCE) return 1; + /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + num_allocated += global_rsv->size; + /* * in limited mode, we want to have some free space up to * about 1% of the FS size. @@ -3317,7 +3340,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_reserved; + reserved = space_info->bytes_may_use; progress = space_info->reservation_progress; if (reserved == 0) @@ -3341,9 +3364,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) - reclaimed += reserved - space_info->bytes_reserved; - reserved = space_info->bytes_reserved; + if (reserved > space_info->bytes_may_use) + reclaimed += reserved - space_info->bytes_may_use; + reserved = space_info->bytes_may_use; spin_unlock(&space_info->lock); loops++; @@ -3401,7 +3424,6 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, int ret = 0; bool committed = false; bool flushing = false; - again: ret = 0; spin_lock(&space_info->lock); @@ -3443,7 +3465,7 @@ again: if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; if (unused >= num_bytes) { - space_info->bytes_reserved += orig_bytes; + space_info->bytes_may_use += orig_bytes; ret = 0; } else { /* @@ -3614,7 +3636,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, } if (num_bytes) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= num_bytes; + space_info->bytes_may_use -= num_bytes; space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3825,12 +3847,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; - sinfo->bytes_reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_reserved -= num_bytes; + sinfo->bytes_may_use -= num_bytes; sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4133,7 +4155,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4185,7 +4206,6 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4212,46 +4232,55 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } -/* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. + * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) { + struct btrfs_space_info *space_info = cache->space_info; int ret = 0; - if (sinfo) { - struct btrfs_space_info *space_info = cache->space_info; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - spin_lock(&cache->lock); + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { if (cache->ro) { ret = -EAGAIN; } else { - if (reserve) - cache->reserved += num_bytes; - else - cache->reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + BUG_ON(space_info->bytes_may_use < num_bytes); + space_info->bytes_may_use -= num_bytes; + } } - spin_unlock(&cache->lock); + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); return ret; } @@ -4322,7 +4351,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) } else if (cache->reserved_pinned > 0) { len = min(len, cache->reserved_pinned); cache->reserved_pinned -= len; - cache->space_info->bytes_reserved += len; + cache->space_info->bytes_may_use += len; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4701,27 +4730,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); - if (ret == -EAGAIN) { - /* block group became read-only */ - btrfs_update_reserved_bytes(cache, buf->len, 0, 1); - goto out; - } + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - ret = 1; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - block_rsv->reserved += buf->len; - ret = 0; - } - spin_unlock(&block_rsv->lock); - - if (ret) { - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_reserved -= buf->len; - cache->space_info->reservation_progress++; - spin_unlock(&cache->space_info->lock); - } goto out; } pin: @@ -4881,6 +4891,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; int index = 0; + int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? + RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@ -5200,8 +5212,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, - (data & BTRFS_BLOCK_GROUP_DATA)); + ret = btrfs_update_reserved_bytes(block_group, num_bytes, + alloc_type); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -5323,7 +5335,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", + printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + (unsigned long long)info->flags, (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly), @@ -5425,7 +5438,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, 0, 1); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5628,7 +5641,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -6594,7 +6608,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->reserved_pinned + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_reserved += cache->reserved_pinned; + sinfo->bytes_may_use += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; @@ -6962,7 +6976,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_space_info, list); if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0) { + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { WARN_ON(1); dump_space_info(space_info, 0, 0); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41ac927401d0..79c16a68a2bc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2472,9 +2472,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, spin_unlock(&ctl->tree_lock); if (bytes >= minlen) { - int update_ret; - update_ret = btrfs_update_reserved_bytes(block_group, - bytes, 1, 1); + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); ret = btrfs_error_discard_extent(fs_info->extent_root, start, @@ -2482,9 +2492,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, &actually_trimmed); btrfs_add_free_space(block_group, start, bytes); - if (!update_ret) - btrfs_update_reserved_bytes(block_group, - bytes, 0, 1); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } if (ret) break; -- cgit v1.2.3 From 7709cde33f12db71efb377fae4ae7aab6c94ebc6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 10:25:02 -0400 Subject: Btrfs: calculate checksum space correctly We have not been reserving enough space for checksums. We were just reserving bytes for the checksum items themselves, we were not taking into account having to cow the tree and such. This patch adds a csum_bytes counter to the inode for keeping track of the number of bytes outstanding we have for checksums. Then we calculate how many leaves would be required for the checksums we are given and use that to reserve space. This adds a significant amount of bytes to our reservations, but we will handle this later. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 6 +++ fs/btrfs/extent-tree.c | 117 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/inode.c | 3 ++ 3 files changed, 118 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c70fb10a307b..5a5d325a3935 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -123,6 +123,12 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. + */ + u64 csum_bytes; + /* flags field from the on disk inode */ u32 flags; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fbe6278f466b..4add1ac2dda0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3984,11 +3984,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ static unsigned drop_outstanding_extent(struct inode *inode) { unsigned dropped_extents = 0; - spin_lock(&BTRFS_I(inode)->lock); BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; @@ -3998,19 +4006,70 @@ static unsigned drop_outstanding_extent(struct inode *inode) */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - goto out; + return 0; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; -out: - spin_unlock(&BTRFS_I(inode)->lock); return dropped_extents; } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. + */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) { - return num_bytes >>= 3; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) @@ -4037,9 +4096,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); } + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { unsigned dropped; @@ -4047,8 +4106,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) * We don't need the return value since our reservation failed, * we just need to clean up our counter. */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); WARN_ON(dropped > 1); + BTRFS_I(inode)->csum_bytes -= num_bytes; + spin_unlock(&BTRFS_I(inode)->lock); return ret; } @@ -4057,6 +4119,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4064,9 +4135,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4074,6 +4147,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) to_free); } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; @@ -4091,6 +4179,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + */ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) { btrfs_delalloc_release_metadata(inode, num_bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 156c3a0da792..98b9fa2d77f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6757,6 +6757,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delalloc_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; @@ -6802,6 +6803,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(inode->i_data.nrpages); WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); /* * This can happen where we create an inode, but somebody else also -- cgit v1.2.3 From 37be25bcb6d731914e126f8de59c4367f0d66b80 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Aug 2011 10:25:38 -0400 Subject: Btrfs: kill the durable block rsv stuff This is confusing code and isn't used by anything anymore, so delete it. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 11 ------ fs/btrfs/disk-io.c | 2 - fs/btrfs/extent-tree.c | 100 +++++++++---------------------------------------- fs/btrfs/inode.c | 4 -- fs/btrfs/relocation.c | 1 - 5 files changed, 17 insertions(+), 101 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4ef777e85c89..c5ceba4078cc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -772,13 +772,10 @@ struct btrfs_space_info { struct btrfs_block_rsv { u64 size; u64 reserved; - u64 freed[2]; struct btrfs_space_info *space_info; - struct list_head list; spinlock_t lock; atomic_t usage; unsigned int priority:8; - unsigned int durable:1; unsigned int refill_used:1; unsigned int full:1; }; @@ -840,7 +837,6 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; - u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; @@ -919,11 +915,6 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; - /* list of block reservations that cross multiple transactions */ - struct list_head durable_block_rsv_list; - - struct mutex durable_block_rsv_mutex; - u64 generation; u64 last_trans_committed; @@ -2238,8 +2229,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07b3ac662e19..0b5643a68d57 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1665,8 +1665,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); - INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); - mutex_init(&fs_info->durable_block_rsv_mutex); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4add1ac2dda0..30c0558eae84 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -121,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -3662,7 +3661,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) spin_lock_init(&rsv->lock); atomic_set(&rsv->usage, 1); rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3685,25 +3683,10 @@ void btrfs_free_block_rsv(struct btrfs_root *root, { if (rsv && atomic_dec_and_test(&rsv->usage)) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); + kfree(rsv); } } -/* - * make the block_rsv struct be able to capture freed space. - * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); -} - int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -3745,9 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, ret = 0; } else { num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; + commit_trans = 1; } spin_unlock(&block_rsv->lock); if (!ret) @@ -3763,8 +3744,18 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, } if (commit_trans) { + struct btrfs_space_info *sinfo = block_rsv->space_info; + if (trans) return -EAGAIN; + + spin_lock(&sinfo->lock); + if (sinfo->bytes_pinned < num_bytes) { + spin_unlock(&sinfo->lock); + return -ENOSPC; + } + spin_unlock(&sinfo->lock); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); @@ -3885,10 +3876,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -4447,13 +4434,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_may_use += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4468,11 +4450,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4495,30 +4474,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4820,36 +4775,18 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); - - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } } out: /* @@ -6705,12 +6642,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_may_use += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5f5f8a577e69..6402a41b9023 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2164,9 +2164,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } spin_unlock(&root->orphan_lock); - if (block_rsv) - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -6505,7 +6502,6 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; - btrfs_add_durable_block_rsv(root->fs_info, rsv); trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb1764273d..545b04358249 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3651,7 +3651,6 @@ int prepare_to_relocate(struct reloc_control *rc) return ret; rc->block_rsv->refill_used = 1; - btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; -- cgit v1.2.3 From dabdb6408cb801644fa613c7432da012640b348c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 12:50:18 -0400 Subject: Btrfs: kill unused parts of block_rsv The priority and refill_used flags are not used anymore, and neither is the usage counter, so just remove them from btrfs_block_rsv. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 --- fs/btrfs/extent-tree.c | 23 ++++++----------------- fs/btrfs/relocation.c | 2 -- 3 files changed, 6 insertions(+), 22 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5ceba4078cc..58a06dea4791 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -774,9 +774,6 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - atomic_t usage; - unsigned int priority:8; - unsigned int refill_used:1; unsigned int full:1; }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 30c0558eae84..5395cc639270 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3659,8 +3659,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); - atomic_set(&rsv->usage, 1); - rsv->priority = 6; } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3681,10 +3679,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - if (rsv && atomic_dec_and_test(&rsv->usage)) { - btrfs_block_rsv_release(root, rsv, (u64)-1); - kfree(rsv); - } + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); } int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, @@ -3734,13 +3730,10 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; } if (commit_trans) { @@ -3859,16 +3852,12 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - fs_info->chunk_block_rsv.priority = 10; space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->global_block_rsv.priority = 10; - fs_info->global_block_rsv.refill_used = 1; fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.priority = 10; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 545b04358249..aeaed99e9cfe 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3650,8 +3650,6 @@ int prepare_to_relocate(struct reloc_control *rc) if (ret) return ret; - rc->block_rsv->refill_used = 1; - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; -- cgit v1.2.3 From 13553e5221d6901a33b3f2157a389de085c161fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 13:33:21 -0400 Subject: Btrfs: don't try to commit in btrfs_block_rsv_check We will try and reserve metadata bytes in btrfs_block_rsv_check and if we cannot because we have a transaction open it will return EAGAIN, so we do not need to try and commit the transaction again. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5395cc639270..6356ef2f0c80 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3708,7 +3708,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, u64 min_reserved, int min_factor) { u64 num_bytes = 0; - int commit_trans = 0; int ret = -ENOSPC; if (!block_rsv) @@ -3720,13 +3719,12 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (min_reserved > num_bytes) num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) { + if (block_rsv->reserved >= num_bytes) ret = 0; - } else { + else num_bytes -= block_rsv->reserved; - commit_trans = 1; - } spin_unlock(&block_rsv->lock); + if (!ret) return 0; @@ -3736,26 +3734,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - if (commit_trans) { - struct btrfs_space_info *sinfo = block_rsv->space_info; - - if (trans) - return -EAGAIN; - - spin_lock(&sinfo->lock); - if (sinfo->bytes_pinned < num_bytes) { - spin_unlock(&sinfo->lock); - return -ENOSPC; - } - spin_unlock(&sinfo->lock); - - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - return 0; - } - - return -ENOSPC; + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, -- cgit v1.2.3 From 5e962c7850c273b483acc747b41bd5cddf631049 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 8 Aug 2011 14:03:37 -0400 Subject: Btrfs: kill btrfs_truncate_reserve_metadata Since we've optimized the truncate path, we no longer require this function. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 --- fs/btrfs/extent-tree.c | 31 ------------------------------- 2 files changed, 34 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 58a06dea4791..22a9347a3908 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2240,9 +2240,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6356ef2f0c80..d05967e9d613 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3858,37 +3858,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->chunk_block_rsv.reserved > 0); } -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; - u64 num_bytes; - int ret; - - /* - * Truncate should be freeing data, but give us 2 items just in case it - * needs to use some space. We may want to be smarter about this in the - * future. - */ - num_bytes = btrfs_calc_trans_metadata_size(root, 2); - - /* We already have enough bytes, just return */ - if (rsv->reserved >= num_bytes) - return 0; - - num_bytes -= rsv->reserved; - - /* - * You should have reserved enough space before hand to do this, so this - * should not fail. - */ - ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); - BUG_ON(ret); - - return 0; -} - void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { -- cgit v1.2.3 From 482e6dc5261406fdb921946e70b51467b0305bad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 10:31:56 -0400 Subject: Btrfs: allow callers to specify if flushing can occur for btrfs_block_rsv_check If you run xfstest 224 it you will get lots of messages about not being able to delete inodes and that they will be cleaned up next mount. This is because btrfs_block_rsv_check was not calling reserve_metadata_bytes with the ability to flush, so if there was not enough space, it simply failed. But in truncate and evict case we could easily flush space to try and get enough space to do our work, so make btrfs_block_rsv_check take a flush argument to pass down to reserve_metadata_bytes. Now xfstests 224 runs fine without all those complaints. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/transaction.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2e18b068841b..caa73cd8c00a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2244,7 +2244,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor); + u64 min_reserved, int min_factor, int flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d05967e9d613..a71fcf506531 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3705,7 +3705,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor) + u64 min_reserved, int min_factor, int flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3728,7 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0); + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ecc1a4f85d20..b0122e19db6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -199,7 +199,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, trans->block_rsv = root->orphan_block_rsv; ret = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, - 0, 5); + 0, 5, 0); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8316b570db55..e40b9239660d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3576,10 +3576,10 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0); + ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1); if (ret) { printk(KERN_WARNING "Could not get space for a " - "delete, will truncate on mount\n"); + "delete, will truncate on mount %d\n", ret); btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); goto no_delete; @@ -6575,7 +6575,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0); + ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index aeaed99e9cfe..fd9ac66434b0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2042,7 +2042,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, trans->block_rsv = rc->block_rsv; ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0); + min_reserved, 0, 0); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -3775,7 +3775,7 @@ restart: } ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5); + rc->block_rsv, 0, 5, 0); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3e20cc8c1c06..a1d8c322c1ba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -419,7 +419,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, { int ret; ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5); + &root->fs_info->global_block_rsv, 0, 5, 0); return ret ? 1 : 0; } -- cgit v1.2.3 From 7ed49f187c82821e35f8869399bcf90822a74a23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Aug 2011 15:45:52 -0400 Subject: Btrfs: fix space leak when we fail to make an allocation When changing back to using a spin_lock to protect the extent counters I decided that since we would only be dropping our original extent, it was ok to just drop the extent and return. However since somebody else could have come in and done a reservation, we need to do the normal song and dance to clear the reservation out properly. So calculate how much space we need to free, and then subtract what we just attempted to reserve. If it's more then we know we need to drop those bytes from the delalloc block rsv. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a71fcf506531..99ab5716baad 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4025,16 +4025,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) { + u64 to_free = 0; unsigned dropped; - /* - * We don't need the return value since our reservation failed, - * we just need to clean up our counter. - */ + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - WARN_ON(dropped > 1); - BTRFS_I(inode)->csum_bytes -= num_bytes; + to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + /* + * Somebody could have come in and twiddled with the + * reservation, so if we have to free more than we would have + * reserved from this reservation go ahead and release those + * bytes. + */ + to_free -= to_reserve; + if (to_free) + btrfs_block_rsv_release(root, block_rsv, to_free); return ret; } -- cgit v1.2.3 From 7f70150896ebd1169d9c43484c8c424f755353c4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 22 Aug 2011 15:23:19 -0400 Subject: Btrfs: don't increase the block_rsv's size when emergency allocating space If we have to emergency reserve space we need to not increase the block_rsv size, otherwise we'll leak space. Take for instance delalloc, say we reserve 4k, and we use that 4k, and then we have to emergency allocate another 4k, we bump the size up to 8k, however we've only accounted for 4k in reservations in all of our supporting logic, so we'll go to free the 4k and end up having a size of 4k, which will cause us to later not free as much space. I saw this doing testing where I wasn't reserving enough space for something but was still leaking space, very frustrating. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99ab5716baad..1f1d3e8dcec9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5707,9 +5707,6 @@ use_block_rsv(struct btrfs_trans_handle *trans, ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); if (!ret) { - spin_lock(&block_rsv->lock); - block_rsv->size += blocksize; - spin_unlock(&block_rsv->lock); return block_rsv; } else if (ret && block_rsv != global_rsv) { ret = block_rsv_use_bytes(global_rsv, blocksize); -- cgit v1.2.3 From c09544e07f8cdc455ed8615d4c067d694c33bd18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 10:19:10 -0400 Subject: Btrfs: handle enospc accounting for free space inodes Since free space inodes now use normal checksumming we need to make sure to account for their metadata use. So reserve metadata space, and then if we fail to write out the metadata we can just release it, otherwise it will be freed up when the io completes. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 18 +++++++++++++----- fs/btrfs/free-space-cache.c | 44 +++++++++++++++++++++++++++++--------------- fs/btrfs/inode-map.c | 6 ++++-- fs/btrfs/inode.c | 2 +- 4 files changed, 47 insertions(+), 23 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1f1d3e8dcec9..ccdc4d12e8d4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2755,16 +2755,20 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_delalloc_reserve_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) + if (!ret) { dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + btrfs_free_reserved_data_space(inode, num_pages); + } else { + btrfs_delalloc_release_space(inode, num_pages); + } + out_put: iput(inode); out_free: @@ -4002,9 +4006,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve = 0; unsigned nr_extents = 0; + int flush = 1; int ret; - if (btrfs_transaction_in_commit(root->fs_info)) + if (btrfs_is_free_space_inode(root, inode)) + flush = 0; + + if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); @@ -4023,7 +4031,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1ea10731797a..3bde17ff14c0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -532,6 +532,19 @@ out: return ret; } +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successfull in writing the cache out, + * and -1 if it was not. + */ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, @@ -555,7 +568,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; - int ret = -1; + int ret; + int err = -1; bool next_page = false; bool out_of_space = false; @@ -563,7 +577,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, node = rb_first(&ctl->free_space_offset); if (!node) - return 0; + return -1; if (!i_size_read(inode)) return -1; @@ -767,7 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - ret = 0; goto out; } @@ -789,10 +802,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - if (ret) { - ret = 0; + if (ret) goto out; - } BTRFS_I(inode)->generation = trans->transid; @@ -804,7 +815,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); @@ -818,7 +828,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - ret = -1; clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, NULL, @@ -835,16 +844,15 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = 1; - + err = 0; out: kfree(pages); - if (ret != 1) { + if (err) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return ret; + return err; } int btrfs_write_out_cache(struct btrfs_root *root, @@ -871,14 +879,16 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); - if (ret < 0) { + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); ret = 0; - +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free space cace " "for block group %llu\n", block_group->key.objectid); +#endif } iput(inode); @@ -2662,9 +2672,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, return 0; ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); - if (ret < 0) + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free ino cache " "for root %llu\n", root->root_key.objectid); +#endif + } iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0fa871..53dcbdf446cd 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -465,14 +465,16 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); - if (ret) + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); goto out_put; + } btrfs_free_reserved_data_space(inode, prealloc); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ae5b173fd7..78b5ae59ac4f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1792,11 +1792,11 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) } ret = 0; out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); if (nolock) { if (trans) btrfs_end_transaction_nolock(trans, root); } else { - btrfs_delalloc_release_metadata(inode, ordered_extent->len); if (trans) btrfs_end_transaction(trans, root); } -- cgit v1.2.3 From 4c13d758b7e79c14a0026c1f783f0c79e339b7bb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:31:29 -0400 Subject: Btrfs: use the transactions block_rsv for the csum root The alloc warnings everybody has been seeing is because we have been reserving space for csums, but we weren't actually using that space. So make get_block_rsv() return the trans->block_rsv if we're modifying the csum root. Also set the trans->block_rsv to NULL so that if we modify the csum root when running delayed ref's that comes out of the global reserve like it's supposed to. With this patch I'm not seeing those alloc warnings anymore. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++------ fs/btrfs/transaction.c | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ccdc4d12e8d4..53f6dbdab510 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3565,10 +3565,12 @@ out: static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (root->ref_cows) + struct btrfs_block_rsv *block_rsv = NULL; + + if (root->ref_cows || root == root->fs_info->csum_root) block_rsv = trans->block_rsv; - else + + if (!block_rsv) block_rsv = root->block_rsv; if (!block_rsv) @@ -3865,12 +3867,13 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_block_rsv *block_rsv; + if (!trans->bytes_reserved) return; - BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); + block_rsv = &root->fs_info->trans_block_rsv; + btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a1d8c322c1ba..a770f4bd9d31 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -453,6 +453,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; -- cgit v1.2.3 From d02c9955ded7fc56dd1edc987558b084ccb03eb4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 11:40:48 -0400 Subject: Btrfs: don't get the block_rsv in btrfs_free_tree_block Since the durable block rsv stuff has been killed there is no need to get the block_rsv in btrfs_free_tree_block anymore. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53f6dbdab510..d236df790156 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4707,7 +4707,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_rsv *block_rsv; struct btrfs_block_group_cache *cache = NULL; int ret; @@ -4722,10 +4721,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (block_rsv->space_info != cache->space_info) - goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { -- cgit v1.2.3 From 4a92b1b8d2810db4ea0c34616b94c0b3810fa027 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 30 Aug 2011 12:34:28 -0400 Subject: Btrfs: stop passing a trans handle all around the reservation code The only thing that we need to have a trans handle for is in reserve_metadata_bytes and thats to know how much flushing we can do. So instead of passing it around, just check current->journal_info for a trans_handle so we know if we can commit a transaction to try and free up space or not. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 ++---- fs/btrfs/extent-tree.c | 45 +++++++++++++++++++++++---------------------- fs/btrfs/free-space-cache.c | 4 +--- fs/btrfs/inode.c | 4 ++-- fs/btrfs/relocation.c | 15 +++++++-------- fs/btrfs/transaction.c | 8 ++++---- 6 files changed, 39 insertions(+), 43 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index caa73cd8c00a..a5faf8e33baa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2237,12 +2237,10 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, int min_factor, int flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d236df790156..9bb71597aa54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3404,29 +3404,34 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } -/* - * Retries tells us how many times we've called reserve_metadata_bytes. The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space. That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - wether or not we can flush to make our reservation * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; + struct btrfs_trans_handle *trans; u64 unused; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; bool committed = false; bool flushing = false; + + trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3689,8 +3694,7 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { @@ -3699,7 +3703,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3708,8 +3712,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, return ret; } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, int min_factor, int flush) { @@ -3734,7 +3737,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (!ret) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4034,7 +4037,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5689,8 +5692,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5711,8 +5713,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, - 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3bde17ff14c0..ffc42ef44711 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -197,9 +197,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, rsv = trans->block_rsv; trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, - 0, 5, 0); + ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78b5ae59ac4f..2947e94947b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3577,7 +3577,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); if (ret) { printk(KERN_WARNING "Could not get space for a " "delete, will truncate on mount %d\n", ret); @@ -6577,7 +6577,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fd9ac66434b0..3ab67409f90f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2041,8 +2041,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0, 0); + ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved, + 0, 0); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2152,8 +2152,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) err = ret; } @@ -2427,7 +2426,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -3645,7 +3644,7 @@ int prepare_to_relocate(struct reloc_control *rc) * btrfs_init_reloc_root will use them when there * is no reservation in transaction handle. */ - ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3774,8 +3773,8 @@ restart: } } - ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0, + 5, 0); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a770f4bd9d31..8d6f4c78f73f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, */ if (num_items > 0 && root != root->fs_info->chunk_root) { num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(NULL, root, + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, num_bytes); if (ret) @@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0, + 5, 0); return ret ? 1 : 0; } @@ -914,7 +914,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + ret = btrfs_block_rsv_add(root, &pending->block_rsv, to_reserve); if (ret) { pending->error = ret; -- cgit v1.2.3 From 455757c322cc0a0f2a692c5625dd88aaf6a7b889 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Sep 2011 12:26:24 -0400 Subject: Btrfs: delay iput when deleting a block group I kept getting warnings from evict because we were calling btrfs_start_transaction() with a transaction already started when doing a balance. This is because we remove a block group which requires a transaction, and the put the last reference on the cache inode. Instead of doing this we need to delay the iput so it is done not within a transaction having started. This gets rid of our warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9bb71597aa54..5498bdacd4c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7266,7 +7266,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for our lookup ref */ - iput(inode); + btrfs_add_delayed_iput(inode); } key.objectid = BTRFS_FREE_SPACE_OBJECTID; -- cgit v1.2.3 From ef3be45722317f8c2fb0e861065df0c3830ff9ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 22 Sep 2011 14:30:02 -0400 Subject: Btrfs: check unused against how much space we actually want There is a bug that may lead to early ENOSPC in our reservation code. We've been checking against num_bytes which may be above and beyond what we want to actually reserve, which could give us a false ENOSPC. Fix this by making sure the unused space is above how much we want to reserve and not how much we're trying to flush. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5498bdacd4c3..fd65f6bc676c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3472,7 +3472,7 @@ again: */ if (unused <= space_info->total_bytes) { unused = space_info->total_bytes - unused; - if (unused >= num_bytes) { + if (unused >= orig_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { -- cgit v1.2.3 From 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 26 Sep 2011 17:12:22 -0400 Subject: Btrfs: allow us to overcommit our enospc reservations One of the things that kills us is the fact that our ENOSPC reservations are horribly over the top in most normal cases. There isn't too much that can be done about this because when we are completely full we really need them to work like this so we don't under reserve. However if there is plenty of unallocated chunks on the disk we can use that to gauge how much we can overcommit. So this patch adds chunk free space accounting so we always know how much unallocated space we have. Then if we fail to make a reservation within our allocated space, check to see if we can overcommit. In the normal flushing case (like with delalloc metadata reservations) we'll take the free space and divide it by 2 if our metadata profile is setup for DUP or any of those, and then divide it by 8 to make sure we don't overcommit too much. Then if we're in a non-flushing case (we really need this reservation now!) we only limit ourselves to half of the free space. This makes this fio test [torrent] filename=torrent-test rw=randwrite size=4g ioengine=sync directory=/mnt/btrfs-test go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB file system. This doesn't seem to break my other enospc tests, but could really use some more testing as this is a super scary change. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent-tree.c | 61 ++++++++++++++++++++++++++++++++++++++------------ fs/btrfs/volumes.c | 39 ++++++++++++++++++++++++++++---- 4 files changed, 88 insertions(+), 18 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 47dea7118e0e..1eafccb162ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -893,6 +893,10 @@ struct btrfs_fs_info { spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4965a0179b31..51372a521167 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); @@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->free_chunk_space = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fd65f6bc676c..25b69d0f9135 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - wether or not we can flush to make our reservation + * @check - wether this is just to check if we have enough space or not * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush) + u64 orig_bytes, int flush, int check) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; @@ -3459,9 +3460,9 @@ again: } ret = -ENOSPC; - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group @@ -3470,9 +3471,8 @@ again: * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= orig_bytes) { + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; ret = 0; } else { @@ -3489,10 +3489,43 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - num_bytes = unused - space_info->total_bytes + + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret && !check) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + orig_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else @@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..e138af710de2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); out: @@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root, device); @@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; @@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2257,6 +2275,9 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } @@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, index++; } + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + index = 0; stripe = &chunk->stripe; while (index < map->num_stripes) { @@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } -- cgit v1.2.3 From 73bc187680f94bed498f8a669103cad290e41180 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 3 Oct 2011 14:07:49 -0400 Subject: Btrfs: introduce mount option no_space_cache Some users have requested this and I've found I needed a way to disable cache loading without actually clearing the cache, so introduce the no_space_cache option. Before we check the super blocks cache generation field and if it was populated we always turned space caching on. Now we check this and set the space cache option on, and then parse the mount options so that if we want it off it get's turned off. Then we check the mount option all the places we do the caching work instead of checking the super's cache generation. This makes things more consistent and lets us turn space caching off. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 ++++----- fs/btrfs/super.c | 21 +++++++++++++++++---- fs/btrfs/transaction.c | 2 +- 3 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 25b69d0f9135..f9711a82fc54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -481,7 +481,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * we likely hold important locks. */ if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root)) { + (root && root != root->fs_info->tree_root) && + btrfs_test_opt(root, SPACE_CACHE)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); @@ -4223,7 +4224,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_super_cache_generation(&info->super_copy) != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -7038,13 +7039,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = 1; cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); - if (cache_gen != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; - if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); while (1) { ret = find_first_block_group(root, path, &key); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 934789f7fd33..266d1f35465d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -164,7 +164,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_err, }; static match_table_t tokens = { @@ -197,6 +197,7 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, + {Opt_no_space_cache, "no_space_cache"}, {Opt_err, NULL}, }; @@ -208,14 +209,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig; + char *p, *num, *orig = NULL; + u64 cache_gen; int intarg; int ret = 0; char *compress_type; bool compress_force = false; + cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + if (!options) - return 0; + goto out; /* * strsep changes the string, duplicate it because parse_options @@ -362,9 +368,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, DISCARD); break; case Opt_space_cache: - printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_no_space_cache: + printk(KERN_INFO "btrfs: disabling disk space caching\n"); + btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_inode_cache: printk(KERN_INFO "btrfs: enabling inode map caching\n"); btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); @@ -393,6 +402,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); kfree(orig); return ret; } @@ -687,6 +698,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",no_space_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 45655793a2c5..1e1a4816ccb0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1003,7 +1003,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; } -- cgit v1.2.3 From 9a82ca659d8bfd99afc0e89bbde2202322df5755 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 5 Oct 2011 16:35:28 -0400 Subject: Btrfs: take overflow into account in reserving space My overcommit stuff can be a little racy when we're filling up the disk with fs_mark and we overcommit into things that quickly get used up for data. So use num_bytes to see if we have enough available space so we're less likely to overcommit ourselves out of the ability to make reservations. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f9711a82fc54..f95e55083bdb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3521,7 +3521,7 @@ again: avail >>= 1; spin_unlock(&root->fs_info->free_chunk_lock); - if (used + orig_bytes < space_info->total_bytes + avail) { + if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; } -- cgit v1.2.3 From 5b0e95bf607ddd59b39f52d3d55e6581c817b530 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Oct 2011 08:58:24 -0400 Subject: Btrfs: inline checksums into the disk free space cache Yeah yeah I know this is how we used to do it and then I changed it, but damnit I'm changing it back. The fact is that writing out checksums will modify metadata, which could cause us to dirty a block group we've already written out, so we have to truncate it and all of it's checksums and re-write it which will write new checksums which could dirty a blockg roup that has already been written and you see where I'm going with this? This can cause unmount or really anything that depends on a transaction to commit to take it's sweet damned time to happen. So go back to the way it was, only this time we're specifically setting NODATACOW because we can't go through the COW pathway anyway and we're doing our own built-in cow'ing by truncating the free space cache. The other new thing is once we truncate the old cache and preallocate the new space, we don't need to do that song and dance at all for the rest of the transaction, we can just overwrite the existing space with the new cache if the block group changes for whatever reason, and the NODATACOW will let us do this fine. So keep track of which transaction we last cleared our cache in and if we cleared it in this transaction just say we're all setup and carry on. This survives xfstests and stress.sh. The inode cache will continue to use the normal csum infrastructure since it only gets written once and there will be no more modifications to the fs tree in a transaction commit. Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 18 ++-- fs/btrfs/free-space-cache.c | 211 ++++++++++++++++++++++++++++++++------------ fs/btrfs/inode.c | 10 +-- 4 files changed, 172 insertions(+), 68 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1eafccb162ee..ea60897a9171 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -838,6 +838,7 @@ struct btrfs_block_group_cache { u64 bytes_super; u64 flags; u64 sectorsize; + u64 cache_generation; unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f95e55083bdb..0abf70c984e9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2717,6 +2717,13 @@ again: goto again; } + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -2756,19 +2763,16 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); - if (!ret) { + if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); - } else { - btrfs_delalloc_release_space(inode, num_pages); - } + btrfs_free_reserved_data_space(inode, num_pages); out_put: iput(inode); @@ -2776,6 +2780,8 @@ out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); + if (!ret) + block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abc924c9467c..5d40c1ed8225 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, *block_group, struct btrfs_path *path) { struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) @@ -99,9 +100,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + if (!((BTRFS_I(inode)->flags & flags) == flags)) { printk(KERN_INFO "Old style space inode found, converting.\n"); - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; } @@ -123,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -141,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -249,6 +255,7 @@ struct io_ctl { unsigned long size; int index; int num_pages; + unsigned check_crcs:1; }; static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, @@ -262,6 +269,8 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, if (!io_ctl->pages) return -ENOMEM; io_ctl->root = root; + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + io_ctl->check_crcs = 1; return 0; } @@ -340,25 +349,39 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) io_ctl_map_page(io_ctl, 1); /* - * Skip the first 64bits to make sure theres a bogus crc for old - * kernels + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. */ - io_ctl->cur += sizeof(u64); + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } val = io_ctl->cur; *val = cpu_to_le64(generation); io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; } static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) { u64 *gen; - io_ctl_map_page(io_ctl, 0); + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } - /* Skip the bogus crc area */ - io_ctl->cur += sizeof(u64); gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { printk_ratelimited(KERN_ERR "btrfs: space cache generation " @@ -368,7 +391,63 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) return -EIO; } io_ctl->cur += sizeof(u64); - io_ctl->size -= sizeof(u64) * 2; + return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages;; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = kmap(io_ctl->pages[0]); + tmp += index; + *tmp = crc; + kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = kmap(io_ctl->pages[0]); + tmp += index; + val = *tmp; + kunmap(io_ctl->pages[0]); + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + return 0; } @@ -391,22 +470,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) return 0; - /* - * index == 1 means the current page is 0, we need to generate a bogus - * crc for older kernels. - */ - if (io_ctl->index == 1) { - u32 *tmp; - u32 crc = ~(u32)0; - - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64), - crc, PAGE_CACHE_SIZE - sizeof(u64)); - btrfs_csum_final(crc, (char *)&crc); - crc++; - tmp = io_ctl->orig; - *tmp = crc; - } - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); /* No more pages to map */ if (io_ctl->index >= io_ctl->num_pages) @@ -427,14 +491,14 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) * map the next one if there is any left. */ if (io_ctl->cur != io_ctl->orig) { - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index >= io_ctl->num_pages) return -ENOSPC; io_ctl_map_page(io_ctl, 0); } memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); return 0; @@ -442,51 +506,60 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) { - io_ctl_unmap_page(io_ctl); + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. + */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); while (io_ctl->index < io_ctl->num_pages) { io_ctl_map_page(io_ctl, 1); - io_ctl_unmap_page(io_ctl); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); } } -static u8 io_ctl_read_entry(struct io_ctl *io_ctl, - struct btrfs_free_space *entry) +static int io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; - u8 type; e = io_ctl->cur; entry->offset = le64_to_cpu(e->offset); entry->bytes = le64_to_cpu(e->bytes); - type = e->type; + *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) - return type; + return 0; io_ctl_unmap_page(io_ctl); if (io_ctl->index >= io_ctl->num_pages) - return type; + return 0; - io_ctl_map_page(io_ctl, 0); - return type; + return io_ctl_check_crc(io_ctl, io_ctl->index); } -static void io_ctl_read_bitmap(struct io_ctl *io_ctl, - struct btrfs_free_space *entry) +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) { - BUG_ON(!io_ctl->cur); - if (io_ctl->cur != io_ctl->orig) { + int ret; + + if (io_ctl->cur && io_ctl->cur != io_ctl->orig) io_ctl_unmap_page(io_ctl); - io_ctl_map_page(io_ctl, 0); - } + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); io_ctl_unmap_page(io_ctl); - if (io_ctl->index < io_ctl->num_pages) - io_ctl_map_page(io_ctl, 0); + + return 0; } int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, @@ -553,6 +626,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (ret) goto out; + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + ret = io_ctl_check_generation(&io_ctl, generation); if (ret) goto free_cache; @@ -563,7 +640,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!e) goto free_cache; - type = io_ctl_read_entry(&io_ctl, e); + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } + if (!e->bytes) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -611,7 +693,9 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, */ list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - io_ctl_read_bitmap(&io_ctl, e); + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; } io_ctl_drop_pages(&io_ctl); @@ -632,7 +716,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->tree_root; struct inode *inode; struct btrfs_path *path; - int ret; + int ret = 0; bool matched; u64 used = btrfs_block_group_used(&block_group->item); @@ -664,6 +748,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, return 0; } + /* We may have converted the inode and made the cache invalid. */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + goto out; + } + spin_unlock(&block_group->lock); + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, path, block_group->key.objectid); btrfs_free_path(path); @@ -774,6 +866,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, cluster = NULL; } + /* Make sure we can fit our crcs into the first page */ + if (io_ctl.check_crcs && + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { + WARN_ON(1); + goto out_nospc; + } + io_ctl_set_generation(&io_ctl, trans->transid); /* Write out the extent entries */ @@ -864,8 +963,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); goto out; } leaf = path->nodes[0]; @@ -878,9 +977,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, found_key.offset != offset) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); btrfs_release_path(path); goto out; } @@ -942,7 +1040,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 15adfb542502..246397d8478f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1792,12 +1792,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) } ret = 0; out: - btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (nolock) { - if (trans) + if (root != root->fs_info->tree_root) + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) { + if (nolock) btrfs_end_transaction_nolock(trans, root); - } else { - if (trans) + else btrfs_end_transaction(trans, root); } -- cgit v1.2.3 From 4b91c14f913f649d4302b3677b85c4ce87a3d8e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 7 Oct 2011 11:55:34 -0400 Subject: Btrfs: wait for ordered extents if we didn't reclaim enough I noticed recently that my overcommit patch was causing one of my enospc tests to fail 25% of the time with early ENOSPC. This is because my overcommit patch was letting us go way over board, but it wasn't waiting long enough to let the delalloc shrinker do it's job. The problem is we just start writeback and wait a little bit hoping we flush enough, but we only free up delalloc space by having the writes complete all the way. We do this by waiting for ordered extents, which we do but only if we already free'd enough for the reservation, which isn't right, we should flush ordered extents if we didn't reclaim enough in case that will push us over the edge. With this patch I've not seen a failure in this enospc test after running it in a loop for an hour. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0abf70c984e9..fc0de6880045 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3406,7 +3406,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed >= to_reclaim && !trans) + if (reclaimed < to_reclaim && !trans) btrfs_wait_ordered_extents(root, 0, 0); return reclaimed >= to_reclaim; } -- cgit v1.2.3 From bbb495c2ed9d34894cb3d6d366866fc92a2e1adc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:37:45 -0400 Subject: Btrfs: don't check bytes_pinned to determine if we should commit the transaction Before the only reason to commit the transaction to recover space in reserve_metadata_bytes() was if there were enough pinned_bytes to satisfy our reservation. But now we have the delayed inode stuff which will hold it's reservations until we commit the transaction. So say we max out our reservation by creating a bunch of files but don't have any pinned bytes we will ENOSPC out early even though we could commit the transaction and get that space back. So now just unconditionally commit the transaction since currently there is no way to know how much metadata space is being reserved by delayed inode stuff. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc0de6880045..79365a40cb3a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3568,17 +3568,6 @@ again: goto again; } - /* - * Not enough space to be reclaimed, don't bother committing the - * transaction. - */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < orig_bytes) - ret = -ENOSPC; - spin_unlock(&space_info->lock); - if (ret) - goto out; - ret = -EAGAIN; if (trans) goto out; -- cgit v1.2.3 From f104d044376aadcee74605d66b8d9dc2e145782c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 13:56:58 -0400 Subject: Btrfs: wait for ordered extents if we're in trouble when shrinking delalloc The only way we actually reclaim delalloc space is waiting for the IO to completely finish. Usually we kick off a bunch of IO and wait for a little bit and hope we can make our reservation, and usually this works out pretty well. With overcommit however we can get seriously underwater if we're filling up the disk quickly, so we need to be able to force the delalloc shrinker to wait for the ordered IO to finish to give us a better chance of actually reclaiming enough space to get our reservation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 79365a40cb3a..96cbc5104959 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,7 +3334,8 @@ out: * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, int sync) + struct btrfs_root *root, u64 to_reclaim, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; @@ -3387,11 +3388,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (trans && trans->transaction->blocked) return -EAGAIN; - time_left = schedule_timeout_interruptible(1); + if (wait_ordered && !trans) { + btrfs_wait_ordered_extents(root, 0, 0); + } else { + time_left = schedule_timeout_interruptible(1); - /* We were interrupted, exit */ - if (time_left) - break; + /* We were interrupted, exit */ + if (time_left) + break; + } /* we've kicked the IO a few times, if anything has been freed, * exit. There is no sense in looping here for a long time @@ -3406,8 +3411,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed < to_reclaim && !trans) - btrfs_wait_ordered_extents(root, 0, 0); + return reclaimed >= to_reclaim; } @@ -3438,6 +3442,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root, int ret = 0; bool committed = false; bool flushing = false; + bool wait_ordered = false; trans = (struct btrfs_trans_handle *)current->journal_info; again: @@ -3496,6 +3501,7 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ + wait_ordered = true; num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } @@ -3530,6 +3536,8 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; ret = 0; + } else { + wait_ordered = true; } } @@ -3552,7 +3560,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, 1); + ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3564,6 +3572,7 @@ again: * so go back around and try again. */ if (retries < 2) { + wait_ordered = true; retries++; goto again; } -- cgit v1.2.3 From 877da174301dde9062b915da4c8103048be49702 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:02:10 -0400 Subject: Btrfs: allow shrink_delalloc flush the needed reclaimed pages Currently we only allow a maximum of 2 megabytes of pages to be flushed at a time. This was ok before, but now we have overcommit which will screw us in a heartbeat if we are quickly filling the disk. So instead pick either 2 megabytes or the number of pages we need to reclaim to be safe again, which ever is larger. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96cbc5104959..424ae82855c8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3343,7 +3343,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; unsigned long progress; @@ -3366,7 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } max_reclaim = min(reserved, to_reclaim); - + nr_pages = max_t(unsigned long, nr_pages, + max_reclaim >> PAGE_CACHE_SHIFT); while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); -- cgit v1.2.3 From b24e03db0df3e9164c9649db12fecc8c2d81b0d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2011 14:40:17 -0400 Subject: Btrfs: release trans metadata bytes before flushing delayed refs We started setting trans->block_rsv = NULL to allow the delayed refs flushing stuff to use the right block_rsv and then just made btrfs_trans_release_metadata() unconditionally use the trans block rsv. The problem with this is we need to reserve some space in the transaction and then migrate it to the global block rsv, so we need to be able to free that out properly. So instead just move btrfs_trans_release_metadata() before the delayed ref flushing and use trans->block_rsv for the freeing. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 5 +---- fs/btrfs/transaction.c | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 424ae82855c8..eb4fe56b08bb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3909,13 +3909,10 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (!trans->bytes_reserved) return; - block_rsv = &root->fs_info->trans_block_rsv; - btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1e1a4816ccb0..d064fa0a4a07 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -462,6 +462,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; @@ -483,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } - btrfs_trans_release_metadata(trans, root); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1128,6 +1127,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; /* make a pass through all the delayed refs we have so far @@ -1136,8 +1136,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - btrfs_trans_release_metadata(trans, root); - cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to -- cgit v1.2.3 From 36ba022ac0b748dd543f43430b03198e899426c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 12:15:48 -0400 Subject: Btrfs: seperate out btrfs_block_rsv_check out into 2 different functions Currently btrfs_block_rsv_check does 2 things, it will either refill a block reserve like in the truncate or refill case, or it will check to see if there is enough space in the global reserve and possibly refill it. However because of overcommit we could be well overcommitting ourselves just to try and refill the global reserve, when really we should just be committing the transaction. So breack this out into btrfs_block_rsv_refill and btrfs_block_rsv_check. Refill will try to reserve more metadata if it can and btrfs_block_rsv_check will not, it will only tell you if the factor of the total space is still reserved. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 +++- fs/btrfs/extent-tree.c | 41 +++++++++++++++++++++++++++-------------- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/relocation.c | 6 ++---- fs/btrfs/transaction.c | 4 ++-- 6 files changed, 37 insertions(+), 24 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea60897a9171..227620993bce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2252,8 +2252,10 @@ int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor, int flush); + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eb4fe56b08bb..a5f1421eeee9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3422,7 +3422,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, * @block_rsv - the block_rsv we're allocating for * @orig_bytes - the number of bytes we want * @flush - wether or not we can flush to make our reservation - * @check - wether this is just to check if we have enough space or not * * This will reserve orgi_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -3433,7 +3432,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, */ static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, int flush, int check) + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; struct btrfs_trans_handle *trans; @@ -3507,7 +3506,7 @@ again: (orig_bytes * (retries + 1)); } - if (ret && !check) { + if (ret) { u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; @@ -3742,7 +3741,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3752,8 +3751,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, } int btrfs_block_rsv_check(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor, int flush) + struct btrfs_block_rsv *block_rsv, int min_factor) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3762,11 +3760,26 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return 0; spin_lock(&block_rsv->lock); - if (min_factor > 0) - num_bytes = div_factor(block_rsv->size, min_factor); - if (min_reserved > num_bytes) - num_bytes = min_reserved; + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; if (block_rsv->reserved >= num_bytes) ret = 0; else @@ -3776,7 +3789,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -4073,7 +4086,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { u64 to_free = 0; unsigned dropped; @@ -5728,7 +5741,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. @@ -5749,7 +5762,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, return block_rsv; if (ret) { WARN_ON(1); - ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; } else if (ret && block_rsv != global_rsv) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5d40c1ed8225..2fecfc3183ee 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -203,7 +203,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, rsv = trans->block_rsv; trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0); + ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5); if (ret) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6b70bdd0992..f12747c9447b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3604,7 +3604,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_refill(root, rsv, min_size); /* * Try and steal from the global reserve since we will @@ -6613,7 +6613,7 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { - ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1); + ret = btrfs_block_rsv_refill(root, rsv, min_size); if (ret) { /* * This can only happen with the original transaction we diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7fa090fa0d39..10af6a0e0865 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved, - 0, 0); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -3774,8 +3773,7 @@ restart: } } - ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0, - 5, 0); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d064fa0a4a07..29bef63e23ba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0, - 5, 0); + + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); return ret ? 1 : 0; } -- cgit v1.2.3 From 7e355b83efa80e5f5821591c13c17649594d82ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Oct 2011 13:07:31 -0400 Subject: Btrfs: if we have a lot of pinned space, commit the transaction Mitch kept hitting a panic because he was getting ENOSPC. One of my previous patches makes it so we are much better at not allocating new metadata chunks. Unfortunately coupled with the overcommit patch this works us into a bit of a problem if we are removing a bunch of space and end up chewing up all of our space with pinned extents. We can allocate chunks fine and overflow is ok, but the only way to reclaim this space is to commit the transaction. So if we go to overcommit, first check and see how much pinned space we have. If we have more than 80% of the free space chewed up with pinned extents, just commit the transaction, this will free up enough space for our reservation and we won't have this problem anymore. With this patch Mitch's test doesn't blow up anymore. Thanks, Reported-and-tested-by: Mitch Harder Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5f1421eeee9..4eb7d2ba38f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3510,6 +3510,20 @@ again: u64 profile = btrfs_get_alloc_profile(root, 0); u64 avail; + /* + * If we have a lot of space that's pinned, don't bother doing + * the overcommit dance yet and just commit the transaction. + */ + avail = (space_info->total_bytes - space_info->bytes_used) * 8; + do_div(avail, 10); + if (space_info->bytes_pinned >= avail && flush && !trans && + !committed) { + space_info->flush = 1; + flushing = true; + spin_unlock(&space_info->lock); + goto commit; + } + spin_lock(&root->fs_info->free_chunk_lock); avail = root->fs_info->free_chunk_space; @@ -3581,6 +3595,7 @@ again: if (trans) goto out; +commit: ret = -ENOSPC; if (committed) goto out; -- cgit v1.2.3 From 10b2f34d6e7fbe07f498cb2006272e9a561f5e60 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 2 Oct 2011 13:56:53 +0300 Subject: Btrfs: pass the correct root to lookup_free_space_inode() Free space items are located in tree of tree roots, not in the extent tree. It didn't pop up because lookup_free_space_inode() grabs the inode all the time instead of actually searching the tree. Signed-off-by: Ilya Dryomov --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4eb7d2ba38f8..6cfcc9060c83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7312,7 +7312,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); BUG_ON(ret); -- cgit v1.2.3 From 60d2adbb1e7fee1cb4bc67f70bd0bd8ace7b6c3c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 9 Sep 2011 17:34:35 +0800 Subject: Btrfs: fix race between multi-task space allocation and caching space The task may fail to get free space though it is enough when multi-task space allocation and caching space happen at the same time. Task1 Caching Thread Task2 ------------------------------------------------------------------------ find_free_extent The space has not be cached, and start caching thread. And wait for it. cache space, if the space is > 2MB wake up Task1 find_free_extent get all the space that is cached. try to allocate space, but there is no space now. trigger BUG_ON() The message is following: btrfs allocation failed flags 1, wanted 4096 space_info has 1040187392 free, is not full space_info total=1082130432, used=4096, pinned=41938944, reserved=0, may_use=40828928, readonly=0 block group 12582912 has 8388608 bytes, 0 used 8388608 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is block group 1103101952 has 1073741824 bytes, 4096 used 33550336 pinned 0 reserved block group has cluster?: no 0 blocks of free space at or bigger than bytes is ------------[ cut here ]------------ kernel BUG at fs/btrfs/inode.c:835! [] __extent_writepage+0x1bf/0x5ce [btrfs] [] ? __set_page_dirty_nobuffers+0xfe/0x108 [] ? wait_current_trans+0x23/0xec [btrfs] [] ? find_get_pages_tag+0x73/0xe2 [] extent_write_cache_pages.clone.0+0x176/0x29a [btrfs] [] extent_writepages+0x3e/0x53 [btrfs] [] ? do_sync_write+0xc6/0x103 [] ? btrfs_submit_direct+0x414/0x414 [btrfs] [] ? fsnotify+0x236/0x266 [] btrfs_writepages+0x22/0x24 [btrfs] [] do_writepages+0x1c/0x25 [] __filemap_fdatawrite_range+0x4e/0x50 [] filemap_write_and_wait_range+0x28/0x51 [] btrfs_sync_file+0x7d/0x198 [btrfs] [] ? fsnotify_modify+0x5d/0x65 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b [SNIP] RIP [] cow_file_range+0x1c4/0x32b [btrfs] We fix this bug by trying to allocate the space again if there are block groups in caching. Signed-off-by: Miao Xie --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6cfcc9060c83..cef355f1328a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4954,6 +4954,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -5036,6 +5037,7 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -5244,6 +5246,8 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: @@ -5294,6 +5298,9 @@ loop: } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; -- cgit v1.2.3 From dff51cd1c60856c28f5d22a571294c2b70b6b322 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 14 Jun 2011 12:52:17 +0200 Subject: btrfs: ratelimit WARN_ON in use_block_rsv The WARN_ON under some circumstances heavily polute log and slow down the machine. This is just a safety, as the warning should be fixed by another patch, nevertheless, it still pops up during testing. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cef355f1328a..28c4809851a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "compat.h" #include "hash.h" #include "ctree.h" @@ -5783,7 +5784,13 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret) { - WARN_ON(1); + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + /*DEFAULT_RATELIMIT_BURST*/ 2); + if (__ratelimit(&_rs)) { + printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); + WARN_ON(1); + } ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { return block_rsv; -- cgit v1.2.3 From 0e175a1835ffc979e55787774e58ec79e41957d7 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 7 Oct 2011 21:54:10 -0600 Subject: writeback: Add a 'reason' to wb_writeback_work This creates a new 'reason' field in a wb_writeback_work structure, which unambiguously identifies who initiates writeback activity. A 'wb_reason' enumeration has been added to writeback.h, to enumerate the possible reasons. The 'writeback_work_class' and tracepoint event class and 'writeback_queue_io' tracepoints are updated to include the symbolic 'reason' in all trace events. And the 'writeback_inodes_sbXXX' family of routines has had a wb_stats parameter added to them, so callers can specify why writeback is being started. Acked-by: Jan Kara Signed-off-by: Curt Wohlgemuth Signed-off-by: Wu Fengguang --- fs/btrfs/extent-tree.c | 3 ++- fs/buffer.c | 2 +- fs/ext4/inode.c | 2 +- fs/fs-writeback.c | 49 +++++++++++++++++++++++++++++----------- fs/quota/quota.c | 2 +- fs/sync.c | 4 ++-- fs/ubifs/budget.c | 2 +- include/linux/backing-dev.h | 3 ++- include/linux/writeback.h | 32 +++++++++++++++++++++----- include/trace/events/writeback.h | 14 ++++++++---- mm/backing-dev.c | 3 ++- mm/page-writeback.c | 3 ++- mm/vmscan.c | 3 ++- 13 files changed, 88 insertions(+), 34 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..c9ee0e18bbdc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3340,7 +3340,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); if (reserved > space_info->bytes_reserved) diff --git a/fs/buffer.c b/fs/buffer.c index 1a80b048ade8..f5dcee6c4cfb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -285,7 +285,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 986e2388f031..7fa73a3b2120 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2241,7 +2241,7 @@ static int ext4_nonda_switch(struct super_block *sb) * start pushing delalloc when 1/2 of free blocks are dirty. */ if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb); + writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); return 0; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c51029693600..73c3992b2bb4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -41,11 +41,23 @@ struct wb_writeback_work { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ }; +const char *wb_reason_name[] = { + [WB_REASON_BACKGROUND] = "background", + [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", + [WB_REASON_SYNC] = "sync", + [WB_REASON_PERIODIC] = "periodic", + [WB_REASON_LAPTOP_TIMER] = "laptop_timer", + [WB_REASON_FREE_MORE_MEM] = "free_more_memory", + [WB_REASON_FS_FREE_SPACE] = "fs_free_space", + [WB_REASON_FORKER_THREAD] = "forker_thread" +}; + /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this @@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic) + bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; @@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; + work->reason = reason; bdi_queue_work(bdi, work); } @@ -150,9 +163,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * completion. Caller need not hold sb s_umount semaphore. * */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason) { - __bdi_start_writeback(bdi, nr_pages, true); + __bdi_start_writeback(bdi, nr_pages, true, reason); } /** @@ -641,12 +655,14 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, + .reason = reason, }; spin_lock(&wb->list_lock); @@ -825,6 +841,7 @@ static long wb_check_background_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, + .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); @@ -858,6 +875,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, + .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); @@ -976,7 +994,7 @@ int bdi_writeback_thread(void *data) * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. */ -void wakeup_flusher_threads(long nr_pages) +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; @@ -989,7 +1007,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false); + __bdi_start_writeback(bdi, nr_pages, false, reason); } rcu_read_unlock(); } @@ -1210,7 +1228,9 @@ static void wait_sb_inodes(struct super_block *sb) * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { @@ -1219,6 +1239,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) .tagged_writepages = 1, .done = &done, .nr_pages = nr, + .reason = reason, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1235,9 +1256,9 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); @@ -1248,11 +1269,11 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb) +int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, reason); up_read(&sb->s_umount); return 1; } else @@ -1269,11 +1290,12 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); * Returns 1 if writeback was started, 0 if not. */ int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr) + unsigned long nr, + enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr); + writeback_inodes_sb_nr(sb, nr, reason); up_read(&sb->s_umount); return 1; } else @@ -1297,6 +1319,7 @@ void sync_inodes_sb(struct super_block *sb) .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, + .reason = WB_REASON_SYNC, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 10b6be3ca280..4bae57fc603b 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, /* caller already holds s_umount */ if (sb->s_flags & MS_RDONLY) return -EROFS; - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); return 0; default: return -EINVAL; diff --git a/fs/sync.c b/fs/sync.c index c98a7477edfd..101b8ef901d7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -98,7 +98,7 @@ static void sync_filesystems(int wait) */ SYSCALL_DEFINE0(sync) { - wakeup_flusher_threads(0); + wakeup_flusher_threads(0, WB_REASON_SYNC); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 315de66e52b2..bc4f94b28706 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -63,7 +63,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3b92010d894..b1038bd686ac 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -118,7 +118,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages); +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index ddb4652cb337..a378c295851f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -38,6 +38,23 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_TRY_TO_FREE_PAGES, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; +extern const char *wb_reason_name[]; + /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised @@ -69,14 +86,17 @@ struct writeback_control { */ struct bdi_writeback; int inode_wait(void *); -void writeback_inodes_sb(struct super_block *); -void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); -int writeback_inodes_sb_if_idle(struct super_block *); -int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); +void writeback_inodes_sb(struct super_block *, enum wb_reason reason); +void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, + enum wb_reason reason); +int writeback_inodes_sb_if_idle(struct super_block *, enum wb_reason reason); +int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr, + enum wb_reason reason); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason); long wb_do_writeback(struct bdi_writeback *wb, int force_wait); -void wakeup_flusher_threads(long nr_pages); +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 1261db3916cc..b99caa8b780c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -34,6 +34,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) + __field(int, reason) ), TP_fast_assign( strncpy(__entry->name, dev_name(bdi->dev), 32); @@ -43,16 +44,18 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; + __entry->reason = work->reason; ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d", + "kupdate=%d range_cyclic=%d background=%d reason=%s", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, - __entry->for_background + __entry->for_background, + wb_reason_name[__entry->reason] ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -165,6 +168,7 @@ TRACE_EVENT(writeback_queue_io, __field(unsigned long, older) __field(long, age) __field(int, moved) + __field(int, reason) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -173,12 +177,14 @@ TRACE_EVENT(writeback_queue_io, __entry->age = older_than_this ? (jiffies - *older_than_this) * 1000 / HZ : -1; __entry->moved = moved; + __entry->reason = work->reason; ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ - __entry->moved) + __entry->moved, + wb_reason_name[__entry->reason]) ); TRACE_EVENT(global_dirty_state, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5dcaa3c756d1..dd8916feb05e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -476,7 +476,8 @@ static int bdi_forker_thread(void *ptr) * the bdi from the thread. Hopefully 1024 is * large enough for efficient IO. */ - writeback_inodes_wb(&bdi->wb, 1024); + writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); } else { /* * The spinlock makes sure we do not lose diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 45d36f7dc169..650846b61584 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1301,7 +1301,8 @@ void laptop_mode_timer_fn(unsigned long data) * threshold */ if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, nr_pages); + bdi_start_writeback(&q->backing_dev_info, nr_pages, + WB_REASON_LAPTOP_TIMER); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index b55699cd9067..c735bd770d3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2181,7 +2181,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { - wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, + WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } -- cgit v1.2.3 From e688b7252f784c2479d559f9f70ca8354752c5e7 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 31 Oct 2011 20:52:39 -0400 Subject: Btrfs: fix extent pinning bugs in the tree log The tree log had two important bugs that could cause corruptions after a crash. Sometimes we were allowing tree log blocks to be reused after the tree log was committed but before the transaction commit was done. This allowed a future metadata write to overwrite the tree log data. It is fixed by adding a new variant of freeing reserved extents that always pins them. Credit goes to Stefan Behrens and Arne Jansen for many many hours spent tracking this bug down. During tree log replay, we do a pass through the tree log and pin all the extents we find. This makes sure the replay code won't go in and use any of those blocks for new allocations during replay. The problem is the free space cache isn't honoring these pinned extents. So the allocator can end up handing them out, leading to all kinds of problems during replay. The fix here is to force any free space cache to load while we pin the extents, and then to make sure we remove the pinned extents from the free space rbtree. Signed-off-by: Chris Mason Reported-by: Stefan Behrens --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/tree-log.c | 11 ++++++----- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 227620993bce..f63c9b3f6e08 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2156,6 +2156,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -2206,6 +2209,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 28c4809851a5..cb7626646bba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4344,6 +4344,34 @@ int btrfs_pin_extent(struct btrfs_root *root, return 0; } +/* + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. + */ + cache_block_group(cache, trans, root, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return 0; +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -5487,7 +5515,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int pin) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -5502,8 +5531,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + } btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5511,6 +5544,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 310ab22cfe58..8ca1b6b83bd1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent(log->fs_info->extent_root, - eb->start, eb->len, 0); + btrfs_pin_extent_for_log_replay(wc->trans, + log->fs_info->extent_root, + eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); } @@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(next); WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); BUG_ON(ret); @@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(log, next->start, + ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); BUG_ON(ret); } -- cgit v1.2.3 From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 13 Apr 2011 15:41:04 +0200 Subject: btrfs: separate superblock items out of fs_info fs_info has now ~9kb, more than fits into one page. This will cause mount failure when memory is too fragmented. Top space consumers are super block structures super_copy and super_for_commit, ~2.8kb each. Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64) Add a wrapper for freeing fs_info and all of it's dynamically allocated members. Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 ++- fs/btrfs/ctree.h | 16 ++++++++++++++-- fs/btrfs/disk-io.c | 33 ++++++++++----------------------- fs/btrfs/extent-tree.c | 16 ++++++++-------- fs/btrfs/file-item.c | 17 ++++++----------- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 8 ++++---- fs/btrfs/scrub.c | 2 +- fs/btrfs/super.c | 19 +++++++++++++------ fs/btrfs/transaction.c | 10 +++++----- fs/btrfs/tree-log.c | 4 ++-- fs/btrfs/volumes.c | 24 ++++++++++++------------ 12 files changed, 78 insertions(+), 76 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8ec5d86f1734..14f1c5a0b2d2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio { static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f63c9b3f6e08..5181c53c1124 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -936,8 +936,8 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -2387,6 +2387,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6f58911ece0d..761717c98278 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -1766,14 +1765,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_alloc; } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2152,7 +2151,6 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: - kfree(fs_info->delayed_root); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2164,12 +2162,7 @@ fail_bdi: fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); + free_fs_info(fs_info); return ERR_PTR(err); } @@ -2338,10 +2331,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); @@ -2603,7 +2596,6 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); - kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2624,12 +2616,7 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info); + free_fs_info(fs_info); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cb7626646bba..782eb3ea8edf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3209,7 +3209,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * about 1% of the FS size. */ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3231,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3843,7 +3843,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -4222,12 +4222,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -7127,9 +7127,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; @@ -7458,7 +7458,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb7821becd..c7fb3a4247d3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; @@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; @@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8586e10953c..b6b612e14ed7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -824,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); + btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 33aae13cc74b..8f6e14279409 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,7 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; - u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -1164,7 +1164,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, mutex_unlock(&inode->i_mutex); } - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; @@ -2613,7 +2613,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return PTR_ERR(trans); } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { @@ -2629,7 +2629,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..69a600f07763 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -182,7 +182,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) sdev->curr = -1; atomic_set(&sdev->in_flight, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5429b1fa0bfc..f7e9de724ef2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -216,7 +216,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *compress_type; bool compress_force = false; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); @@ -524,7 +524,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -937,6 +937,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fs_info->fs_devices = fs_devices; tree_root->fs_info = fs_info; + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_close_devices; + } + bdev = fs_devices->latest_bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) { @@ -951,7 +958,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -979,7 +986,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); - kfree(fs_info); + free_fs_info(fs_info); kfree(tree_root); return ERR_PTR(error); } @@ -1005,7 +1012,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; ret = btrfs_cleanup_fs_roots(root->fs_info); @@ -1171,7 +1178,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 29bef63e23ba..373c7ec1a026 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -991,7 +991,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -1301,12 +1301,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, update_super_roots(root); if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); } - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); trans->transaction->blocked = 0; spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ca1b6b83bd1..f4d81c06d48f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2118,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, BUG_ON(ret); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, + btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c6938b45e0fd..c3b45564048e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1395,8 +1395,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) call_rcu(&device->rcu, free_device); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1458,7 +1458,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1706,12 +1706,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1802,7 +1802,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -1861,7 +1861,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -2187,7 +2187,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; @@ -2311,7 +2311,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -3653,7 +3653,7 @@ static int read_one_dev(struct btrfs_root *root, int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; -- cgit v1.2.3 From 6d668dda0caec537fbf28c4d91e6d18181af3cff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: make a delayed_block_rsv for the delayed item insertion I've been hitting warnings in use_block_rsv when running the delayed insertion stuff. It's because we will readjust global block rsv based on what is in use, which means we could end up discarding reservations that are for the delayed insertion stuff. So instead create a seperate block rsv for the delayed insertion stuff. This will also make it easier to debug problems with the delayed insertion reservations since we will know that only the delayed insertion code touches this block_rsv. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/delayed-inode.c | 14 +++++++------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 3 +++ 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 78f43d1102a0..3002e5d4da0b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -957,6 +957,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b52c672f4c18..fc4026af7290 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -628,7 +628,7 @@ static int btrfs_delayed_inode_reserve_metadata( return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -646,7 +646,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1026,7 +1026,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1069,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->global_block_rsv; + trans->block_rsv = &node->root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) @@ -1149,7 +1149,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) goto free_path; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a61f8a6cf219..23b6776477b7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1891,6 +1891,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 782eb3ea8edf..01c1f08b976a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3914,6 +3914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3933,6 +3934,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From 663350ac38c67ca388acea6e876dc6d668c232b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Nov 2011 22:54:25 -0400 Subject: Btrfs: be smarter about committing the transaction in reserve_metadata_bytes Because of the overcommit stuff I had to make it so that we committed the transaction all the time in reserve_metadata_bytes in case we had overcommitted because of delayed items. This was because previously we had no way of knowing how much space was reserved for delayed items. Now that we have the delayed_block_rsv we can check it to see if committing the transaction would get us anywhere. This patch breaks out the committing logic into a helper function that will check to see if committing the transaction would free enough space for us to get anything done. With this patch xfstests 83 goes from taking 445 seconds to taking 28 seconds on my box. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 86 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 01c1f08b976a..5b84205e7685 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3334,12 +3334,12 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; @@ -3348,6 +3348,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, int loops = 0; unsigned long progress; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; @@ -3417,6 +3418,60 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + spin_lock(&space_info->lock); + if (space_info->bytes_pinned >= bytes) { + spin_unlock(&space_info->lock); + goto commit; + } + spin_unlock(&space_info->lock); + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size < bytes) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -3436,7 +3491,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - struct btrfs_trans_handle *trans; u64 used; u64 num_bytes = orig_bytes; int retries = 0; @@ -3445,7 +3499,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root, bool flushing = false; bool wait_ordered = false; - trans = (struct btrfs_trans_handle *)current->journal_info; again: ret = 0; spin_lock(&space_info->lock); @@ -3461,7 +3514,7 @@ again: * deadlock since we are waiting for the flusher to finish, but * hold the current transaction open. */ - if (trans) + if (current->journal_info) return -EAGAIN; ret = wait_event_interruptible(space_info->wait, !space_info->flush); @@ -3517,12 +3570,16 @@ again: */ avail = (space_info->total_bytes - space_info->bytes_used) * 8; do_div(avail, 10); - if (space_info->bytes_pinned >= avail && flush && !trans && - !committed) { + if (space_info->bytes_pinned >= avail && flush && !committed) { space_info->flush = 1; flushing = true; spin_unlock(&space_info->lock); - goto commit; + ret = may_commit_transaction(root, space_info, + orig_bytes, 1); + if (ret) + goto out; + committed = true; + goto again; } spin_lock(&root->fs_info->free_chunk_lock); @@ -3575,7 +3632,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, wait_ordered); + ret = shrink_delalloc(root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3592,21 +3649,12 @@ again: goto again; } - ret = -EAGAIN; - if (trans) - goto out; - -commit: ret = -ENOSPC; if (committed) goto out; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto out; - ret = btrfs_commit_transaction(trans, root); + ret = may_commit_transaction(root, space_info, orig_bytes, 0); if (!ret) { - trans = NULL; committed = true; goto again; } -- cgit v1.2.3 From c06a0e120a4e381a1c291c1fce3c6155c5791cae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 4 Nov 2011 19:56:02 -0400 Subject: Btrfs: fix delayed insertion reservation We all keep getting those stupid warnings from use_block_rsv when running stress.sh, and it's because the delayed insertion stuff is being stupid. It's not the delayed insertion stuffs fault, it's all just stupid. When marking an inode dirty for oh say updating the time on it, we just do a btrfs_join_transaction, which doesn't reserve any space. This is stupid because we're going to have to have space reserve to make this change, but we do it because it's fast because chances are we're going to call it over and over again and it doesn't matter. Well thanks to the delayed insertion stuff this is mostly the case, so we do actually need to make this reservation. So if trans->bytes_reserved is 0 then try to do a normal reservation. If not return ENOSPC which will make the btrfs_dirty_inode start a proper transaction which will let it do the whole ENOSPC dance and reserve enough space for the delayed insertion to steal the reservation from the transaction. The other stupid thing we do is not reserve space for the inode when writing to the thing. Usually this is ok since we have to update the time so we'd have already done all this work before we get to the endio stuff, so it doesn't matter. But this is stupid because we could write the data after the transaction commits where we changed the mtime of the inode so we have to cow all the way down to the inode anyway. This used to be masked by the delalloc reservation stuff, but because we delay the update it doesn't get masked in this case. So again the delayed insertion stuff bites us in the ass. So if our trans->block_rsv is delalloc, just steal the reservation from the delalloc reserve. Hopefully this won't bite us in the ass, but I've said that before. With this patch stress.sh no longer spits out those stupid warnings (famous last words). Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/delayed-inode.c | 36 ++++++++++++++++++++++++++++-------- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 3 files changed, 49 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3002e5d4da0b..6bb34fc1ff22 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2353,6 +2353,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root, int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc4026af7290..bbe8496d5339 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata( u64 num_bytes; int ret; - if (!trans->bytes_reserved) - return 0; - src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. + * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!trans->bytes_reserved && + src_rsv != &root->fs_info->delalloc_block_rsv) { + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. + */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) + node->bytes_reserved = num_bytes; + return ret; + } + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); if (!ret) node->bytes_reserved = num_bytes; @@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, } ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); - /* - * we must reserve enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + if (ret) + goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); delayed_node->inode_dirty = 1; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b84205e7685..23e936c3de76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3814,6 +3814,24 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + return ret; +} + int btrfs_block_rsv_check(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int min_factor) { -- cgit v1.2.3 From 7fd2ae21a42d178982679b86086661292b4afe4a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 8 Nov 2011 15:47:34 -0500 Subject: Btrfs: fix our reservations for updating an inode when completing io People have been reporting ENOSPC crashes in finish_ordered_io. This is because we try to steal from the delalloc block rsv to satisfy a reservation to update the inode. The problem with this is we don't explicitly save space for updating the inode when doing delalloc. This is kind of a problem and we've gotten away with this because way back when we just stole from the delalloc reserve without any questions, and this worked out fine because generally speaking the leaf had been modified either by the mtime update when we did the original write or because we just updated the leaf when we inserted the file extent item, only on rare occasions had the leaf not actually been modified, and that was still ok because we'd just use a block or two out of the over-reservation that is delalloc. Then came the delayed inode stuff. This is amazing, except it wants a full reservation for updating the inode since it may do it at some point down the road after we've written the blocks and we have to recow everything again. This worked out because the delayed inode stuff just stole from the global reserve, that is until recently when I changed that because it caused other problems. So here we are, we're doing everything right and being screwed for it. So take an extra reservation for the inode at delalloc reservation time and carry it through the life of the delalloc reservation. If we need it we can steal it in the delayed inode stuff. If we have already stolen it try and do a normal metadata reservation. If that fails try to steal from the delalloc reservation. If _that_ fails we'll get a WARN_ON() so I can start thinking of a better way to solve this and in the meantime we'll steal from the global reserve. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't see any problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 4 +-- fs/btrfs/delayed-inode.c | 65 +++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent-tree.c | 22 +++++++++++++--- fs/btrfs/inode.c | 1 + 4 files changed, 83 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5a5d325a3935..634608d2a6d0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -147,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bbe8496d5339..313ee14cf3b7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -652,11 +654,67 @@ static int btrfs_delayed_inode_reserve_metadata( if (!ret) node->bytes_reserved = num_bytes; return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; } +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - node->bytes_reserved = num_bytes; + if (unlikely(ret)) { + /* This shouldn't happen */ + BUG_ON(release); + return ret; + } + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + node->bytes_reserved = num_bytes; return ret; } @@ -1708,7 +1766,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 18ea90c8943b..0b044e509e9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4063,23 +4063,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, */ static unsigned drop_outstanding_extent(struct inode *inode) { + unsigned drop_inode_space = 0; unsigned dropped_extents = 0; BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + /* * If we have more or the same amount of outsanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - return 0; + return drop_inode_space; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; - return dropped_extents; + return dropped_extents + drop_inode_space; } /** @@ -4165,9 +4172,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) nr_extents = BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; BTRFS_I(inode)->reserved_extents += nr_extents; + } - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + BTRFS_I(inode)->delalloc_meta_reserved = 1; } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f60e2490bd0d..2b920596c126 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6607,6 +6607,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; -- cgit v1.2.3 From 61b520a9d0083b9b361638e456af45fd75150c87 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: Abstract similar code for btrfs_block_rsv_add{, _noflush} btrfs_block_rsv_add{, _noflush}() have similar code, so abstract that code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0b044e509e9f..0f47b3e2010e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3796,16 +3796,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3814,22 +3814,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + int btrfs_block_rsv_add_noflush(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } - - return ret; + return __block_rsv_add(root, block_rsv, num_bytes, 0); } int btrfs_block_rsv_check(struct btrfs_root *root, -- cgit v1.2.3