From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Aug 2012 13:14:17 -0400 Subject: Btrfs: turbo charge fsync At least for the vm workload. Currently on fsync we will 1) Truncate all items in the log tree for the given inode if they exist and 2) Copy all items for a given inode into the log The problem with this is that for things like VMs you can have lots of extents from the fragmented writing behavior, and worst yet you may have only modified a few extents, not the entire thing. This patch fixes this problem by tracking which transid modified our extent, and then when we do the tree logging we find all of the extents we've modified in our current transaction, sort them and commit them. We also only truncate up to the xattrs of the inode and copy that stuff in normally, and then just drop any extents in the range we have that exist in the log already. Here are some numbers of a 50 meg fio job that does random writes and fsync()s after every write Original Patched SATA drive 82KB/s 140KB/s Fusion drive 431KB/s 2532KB/s So around 2-6 times faster depending on your hardware. There are a few corner cases, for example if you truncate at all we have to do it the old way since there is no way to be sure what is in the log is ok. This probably could be done smarter, but if you write-fsync-truncate-write-fsync you deserve what you get. All this work is in RAM of course so if your inode gets evicted from cache and you read it in and fsync it we'll do it the slow way if we are still in the same transaction that we last modified the inode in. The biggest cool part of this is that it requires no changes to the recovery code, so if you fsync with this patch and crash and load an old kernel, it will run the recovery and be a-ok. I have tested this pretty thoroughly with an fsync tester and everything comes back fine, as well as xfstests. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7c97b3301459..1fe82cfc1d93 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -35,6 +35,7 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree) { tree->map = RB_ROOT; + INIT_LIST_HEAD(&tree->modified_extents); rwlock_init(&tree->lock); } @@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void) em->in_tree = 0; em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; + em->generation = 0; atomic_set(&em->refs, 1); + INIT_LIST_HEAD(&em->list); return em; } @@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { WARN_ON(em->in_tree); + WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } } @@ -198,6 +202,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; + if (merge->generation > em->generation) { + em->generation = merge->generation; + list_move(&em->list, &tree->modified_extents); + } + + list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); free_extent_map(merge); } @@ -211,11 +221,29 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; + if (merge->generation > em->generation) { + em->generation = merge->generation; + list_move(&em->list, &tree->modified_extents); + } + list_del_init(&merge->list); free_extent_map(merge); } } -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +/** + * unpint_extent_cache - unpin an extent from the cache + * @tree: tree to unpin the extent in + * @start: logical offset in the file + * @len: length of the extent + * @gen: generation that this extent has been modified in + * @prealloc: if this is set we need to clear the prealloc flag + * + * Called after an extent has been written to disk properly. Set the generation + * to the generation that actually added the file item to the inode so we know + * we need to sync this extent when we call fsync(). + */ +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, + u64 gen) { int ret = 0; struct extent_map *em; @@ -228,10 +256,11 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) if (!em) goto out; + list_move(&em->list, &tree->modified_extents); + em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); try_merge_map(tree, em); - free_extent_map(em); out: write_unlock(&tree->lock); @@ -358,6 +387,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase(&em->rb_node, &tree->map); + list_del_init(&em->list); em->in_tree = 0; return ret; } -- cgit v1.2.3 From 4e2f84e63dc138eca91e89ccbc34f37732ce58f7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 27 Aug 2012 10:52:20 -0600 Subject: Btrfs: improve fsync by filtering extents that we want This is based on Josef's "Btrfs: turbo charge fsync". The above Josef's patch performs very good in random sync write test, because we won't have too much extents to merge. However, it does not performs good on the test: dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync The reason is when we do sequencial sync write, we need to merge the current extent just with the previous one, so that we can get accumulated extents to log: A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ... So we'll have to flush more and more checksum into log tree, which is the bottleneck according to my tests. But we can avoid this by telling fsync the real extents that are needed to be logged. With this, I did the above dd sync write test (size=50m), w/o (orig) w/ (josef's) w/ (this) SATA 104KB/s 109KB/s 121KB/s ramdisk 1.5MB/s 1.5MB/s 10.7MB/s (613%) Signed-off-by: Liu Bo --- fs/btrfs/extent_map.c | 20 ++++++++++++++++++++ fs/btrfs/extent_map.h | 2 ++ fs/btrfs/inode.c | 1 + fs/btrfs/tree-log.c | 6 +++--- 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 1fe82cfc1d93..ac606f076eb7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_start = merge->block_start; merge->in_tree = 0; if (merge->generation > em->generation) { + em->mod_start = em->start; + em->mod_len = em->len; em->generation = merge->generation; list_move(&em->list, &tree->modified_extents); } @@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; if (merge->generation > em->generation) { + em->mod_len = em->len; em->generation = merge->generation; list_move(&em->list, &tree->modified_extents); } @@ -247,6 +250,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, { int ret = 0; struct extent_map *em; + bool prealloc = false; write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); @@ -259,8 +263,21 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->mod_start = em->start; + em->mod_len = em->len; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + prealloc = true; + clear_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } try_merge_map(tree, em); + + if (prealloc) { + em->mod_start = em->start; + em->mod_len = em->len; + } + free_extent_map(em); out: write_unlock(&tree->lock); @@ -298,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree, } atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + try_merge_map(tree, em); out: return ret; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 2388a60bd6e3..8e6294b51357 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -20,6 +20,8 @@ struct extent_map { /* all of these are in bytes */ u64 start; u64 len; + u64 mod_start; + u64 mod_len; u64 orig_start; u64 block_start; u64 block_len; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca4fa05171ab..878116d9625d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1308,6 +1308,7 @@ out_check: em->block_start = disk_bytenr; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 58075d711d24..71e71539ffb7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2833,8 +2833,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 start = em->start; - u64 len = em->len; + u64 start = em->mod_start; + u64 len = em->mod_len; u64 num_bytes; int nritems; int ret; @@ -2970,7 +2970,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, * sequential then we need to copy the items we have and redo * our search */ - if (args.nr && em->start != args.next_offset) { + if (args.nr && em->mod_start != args.next_offset) { ret = copy_items(trans, log, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); -- cgit v1.2.3 From 837e197283199de640857192ca32767cb6e24fe8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Sep 2012 03:00:48 -0600 Subject: btrfs: polish names of kmem caches Usecase: watch 'grep btrfs < /proc/slabinfo' easy to watch all caches in one go. Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_map.c | 2 +- fs/btrfs/inode.c | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index eb768c4c1358..b26d2f9d88d7 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache; int __init btrfs_delayed_inode_init(void) { - delayed_node_cache = kmem_cache_create("delayed_node", + delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4a41b17295dc..3ad84f500687 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -64,13 +64,13 @@ tree_fs_info(struct extent_io_tree *tree) int __init extent_io_init(void) { - extent_state_cache = kmem_cache_create("extent_state", + extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = kmem_cache_create("extent_buffers", + extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index ac606f076eb7..8d1364d385d6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = kmem_cache_create("extent_map", + extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b21cf97cdd5..aae55625aa59 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7155,31 +7155,31 @@ void btrfs_destroy_cachep(void) int btrfs_init_cachep(void) { - btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) -- cgit v1.2.3 From ff44c6e36dc9dcc02652a1105b120bdf08cea9f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Sep 2012 12:59:20 -0400 Subject: Btrfs: do not hold the write_lock on the extent tree while logging Dave Sterba pointed out a sleeping while atomic bug while doing fsync. This is because I'm an idiot and didn't realize that rwlock's were spin locks, so we've been holding this thing while doing allocations and such which is not good. This patch fixes this by dropping the write lock before we do anything heavy and re-acquire it when it is done. We also need to take a ref on the em's in case their corresponding pages are evicted and mark them as being logged so that releasepage does not remove them and doesn't remove them from our local list. Thanks, Reported-by: Dave Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 3 ++- fs/btrfs/extent_map.h | 1 + fs/btrfs/tree-log.c | 21 +++++++++++++++++---- 3 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent_map.c') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8d1364d385d6..b8cbc8d5c7f7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -407,7 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase(&em->rb_node, &tree->map); - list_del_init(&em->list); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_del_init(&em->list); em->in_tree = 0; return ret; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e6294b51357..679225555f7b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,6 +13,7 @@ #define EXTENT_FLAG_COMPRESSED 1 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ +#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 038a5229404a..ed1f7ce7219a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2945,6 +2945,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_del_init(&em->list); if (em->generation <= test_gen) continue; + /* Need a ref to keep it from getting evicted from cache */ + atomic_inc(&em->refs); + set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); } @@ -2954,13 +2957,18 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ - if (ret) + if (ret) { + free_extent_map(em); continue; + } + + write_unlock(&tree->lock); /* * If the previous EM and the last extent we left off on aren't @@ -2971,21 +2979,26 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); - if (ret) + if (ret) { + free_extent_map(em); + write_lock(&tree->lock); continue; + } btrfs_release_path(path); args.nr = 0; } ret = log_one_extent(trans, inode, root, em, path, dst_path, &args); + free_extent_map(em); + write_lock(&tree->lock); } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); if (!ret && args.nr) ret = copy_items(trans, inode, dst_path, args.src, args.start_slot, args.nr, LOG_INODE_ALL); btrfs_release_path(path); - WARN_ON(!list_empty(&extents)); - write_unlock(&tree->lock); return ret; } -- cgit v1.2.3