diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-12-01 10:05:14 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-12-01 10:05:14 +1100 |
commit | d795ad0b06209ff30fb79e34c75363a025d3f617 (patch) | |
tree | cdf4e01852598b9bf2c6476ab848880479e674e4 /fs | |
parent | f41693dc821565ee610a9346399494e7556a02cf (diff) | |
parent | 930a0ad9223a2a944023ac8cd15f0e866a7f75e6 (diff) |
Merge remote-tracking branch 'btrfs-kdave/for-next'
Diffstat (limited to 'fs')
36 files changed, 1258 insertions, 873 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5982c8a71f02..06ef50712acd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1522,12 +1522,23 @@ out: unsigned int btrfs_compress_str2level(const char *str) { - if (strncmp(str, "zlib", 4) != 0) + long level; + int max; + + if (strncmp(str, "zlib", 4) == 0) + max = 9; + else if (strncmp(str, "zstd", 4) == 0) + max = 15; // encoded on 4 bits, real max is 22 + else return 0; - /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ - if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) - return str[5] - '0'; + /* Accepted form: zxxx:1 up to zxxx:9 and nothing left after the number */ + str += 4; + if (*str == ':') + str++; + + if (kstrtoul(str, 10, &level)) + return BTRFS_ZLIB_DEFAULT_LEVEL; - return BTRFS_ZLIB_DEFAULT_LEVEL; + return (level > max) ? BTRFS_ZLIB_DEFAULT_LEVEL : level; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 0868cc554f14..6b692903a23c 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -75,7 +75,7 @@ struct compressed_bio { u32 sums; }; -void btrfs_init_compress(void); +void __init btrfs_init_compress(void); void btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 531e0a8645b0..1e74cf826532 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1032,14 +1032,17 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { @@ -1049,7 +1052,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } if (new_flags != 0) { int level = btrfs_header_level(buf); @@ -1068,9 +1072,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } clean_tree_block(fs_info, buf); *last_ref = 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 13c260b525a1..09b72b6996ce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3060,15 +3060,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, int slot, - struct btrfs_dir_item *dir_item); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); -bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, - unsigned long start, u16 name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3197,7 +3192,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); -int btrfs_init_cachep(void); +int __init btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, @@ -3248,7 +3243,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, struct file *dst_file, u64 dst_loff); /* file.c */ -int btrfs_auto_defrag_init(void); +int __init btrfs_auto_defrag_init(void); void btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); @@ -3283,7 +3278,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* sysfs.c */ -int btrfs_init_sysfs(void); +int __init btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 5d73f79ded8b..f8e59ac4967e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1279,40 +1279,42 @@ static void btrfs_async_run_delayed_root(struct btrfs_work *work) if (!path) goto out; -again: - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2) - goto free_path; + do { + if (atomic_read(&delayed_root->items) < + BTRFS_DELAYED_BACKGROUND / 2) + break; - delayed_node = btrfs_first_prepared_delayed_node(delayed_root); - if (!delayed_node) - goto free_path; + delayed_node = btrfs_first_prepared_delayed_node(delayed_root); + if (!delayed_node) + break; - path->leave_spinning = 1; - root = delayed_node->root; + path->leave_spinning = 1; + root = delayed_node->root; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto release_path; + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; + continue; + } - block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->delayed_block_rsv; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; - __btrfs_commit_inode_delayed_items(trans, path, delayed_node); + __btrfs_commit_inode_delayed_items(trans, path, delayed_node); - trans->block_rsv = block_rsv; - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty_nodelay(root->fs_info); + trans->block_rsv = block_rsv; + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty_nodelay(root->fs_info); -release_path: - btrfs_release_path(path); - total_done++; + btrfs_release_path(path); + btrfs_release_prepared_delayed_node(delayed_node); + total_done++; - btrfs_release_prepared_delayed_node(delayed_node); - if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || - total_done < async_work->nr) - goto again; + } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) + || total_done < async_work->nr); -free_path: btrfs_free_path(path); out: wake_up(&delayed_root->wait); @@ -1325,10 +1327,6 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, { struct btrfs_async_delayed_work *async_work; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || - btrfs_workqueue_normal_congested(fs_info->delayed_workers)) - return 0; - async_work = kmalloc(sizeof(*async_work), GFP_NOFS); if (!async_work) return -ENOMEM; @@ -1364,7 +1362,8 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) { struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) return; if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 83be8f9fd906..a1a40cf382e3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -937,7 +937,7 @@ void btrfs_delayed_ref_exit(void) kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } -int btrfs_delayed_ref_init(void) +int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a43af432f859..c4f625e5a691 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -203,7 +203,7 @@ extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; extern struct kmem_cache *btrfs_delayed_data_ref_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; -int btrfs_delayed_ref_init(void); +int __init btrfs_delayed_ref_init(void); void btrfs_delayed_ref_exit(void); static inline struct btrfs_delayed_extent_op * diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 41cb9196eaa8..cbe421605cd5 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -403,8 +403,6 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); - if (verify_dir_item(fs_info, leaf, path->slots[0], dir_item)) - return NULL; if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; @@ -450,109 +448,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, } return ret; } - -int verify_dir_item(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - int slot, - struct btrfs_dir_item *dir_item) -{ - u16 namelen = BTRFS_NAME_LEN; - int ret; - u8 type = btrfs_dir_type(leaf, dir_item); - - if (type >= BTRFS_FT_MAX) { - btrfs_crit(fs_info, "invalid dir item type: %d", (int)type); - return 1; - } - - if (type == BTRFS_FT_XATTR) - namelen = XATTR_NAME_MAX; - - if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned)btrfs_dir_name_len(leaf, dir_item)); - return 1; - } - - namelen = btrfs_dir_name_len(leaf, dir_item); - ret = btrfs_is_name_len_valid(leaf, slot, - (unsigned long)(dir_item + 1), namelen); - if (!ret) - return 1; - - /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ - if ((btrfs_dir_data_len(leaf, dir_item) + - btrfs_dir_name_len(leaf, dir_item)) > - BTRFS_MAX_XATTR_SIZE(fs_info)) { - btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u", - (unsigned)btrfs_dir_name_len(leaf, dir_item), - (unsigned)btrfs_dir_data_len(leaf, dir_item)); - return 1; - } - - return 0; -} - -bool btrfs_is_name_len_valid(struct extent_buffer *leaf, int slot, - unsigned long start, u16 name_len) -{ - struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_key key; - u32 read_start; - u32 read_end; - u32 item_start; - u32 item_end; - u32 size; - bool ret = true; - - ASSERT(start > BTRFS_LEAF_DATA_OFFSET); - - read_start = start - BTRFS_LEAF_DATA_OFFSET; - read_end = read_start + name_len; - item_start = btrfs_item_offset_nr(leaf, slot); - item_end = btrfs_item_end_nr(leaf, slot); - - btrfs_item_key_to_cpu(leaf, &key, slot); - - switch (key.type) { - case BTRFS_DIR_ITEM_KEY: - case BTRFS_XATTR_ITEM_KEY: - case BTRFS_DIR_INDEX_KEY: - size = sizeof(struct btrfs_dir_item); - break; - case BTRFS_INODE_REF_KEY: - size = sizeof(struct btrfs_inode_ref); - break; - case BTRFS_INODE_EXTREF_KEY: - size = sizeof(struct btrfs_inode_extref); - break; - case BTRFS_ROOT_REF_KEY: - case BTRFS_ROOT_BACKREF_KEY: - size = sizeof(struct btrfs_root_ref); - break; - default: - ret = false; - goto out; - } - - if (read_start < item_start) { - ret = false; - goto out; - } - if (read_end > item_end) { - ret = false; - goto out; - } - - /* there shall be item(s) before name */ - if (read_start - item_start < size) { - ret = false; - goto out; - } - -out: - if (!ret) - btrfs_crit(fs_info, "invalid dir item name len: %u", - (unsigned int)name_len); - return ret; -} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 10a2a579cc7f..82c96607fc46 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -220,7 +220,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ -static struct extent_map *btree_get_extent(struct btrfs_inode *inode, +struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { @@ -285,7 +285,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, int verify) { u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - char *result = NULL; + char result[BTRFS_CSUM_SIZE]; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -294,7 +294,6 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, unsigned long map_len; int err; u32 crc = ~(u32)0; - unsigned long inline_result; len = buf->len - offset; while (len > 0) { @@ -308,13 +307,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len -= cur_len; offset += cur_len; } - if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size, GFP_NOFS); - if (!result) - return -ENOMEM; - } else { - result = (char *)&inline_result; - } + memset(result, 0, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); @@ -329,15 +322,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, "%s checksum verify failed on %llu wanted %X found %X level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); - if (result != (char *)&inline_result) - kfree(result); return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); } - if (result != (char *)&inline_result) - kfree(result); + return 0; } @@ -455,7 +445,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, - btree_get_extent, mirror_num); + mirror_num); if (!ret) { if (!verify_parent_transid(io_tree, eb, parent_transid, 0)) @@ -1012,7 +1002,7 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, btree_get_extent, 0); + buf, WAIT_NONE, 0); free_extent_buffer(buf); } @@ -1031,7 +1021,7 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, - btree_get_extent, mirror_num); + mirror_num); if (ret) { free_extent_buffer(buf); return ret; @@ -1243,7 +1233,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root; struct btrfs_key key; int ret = 0; - uuid_le uuid; + uuid_le uuid = { 0 }; root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) @@ -1284,7 +1274,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); - uuid_le_gen(&uuid); + if (is_fstree(objectid)) + uuid_le_gen(&uuid); memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0; @@ -3396,9 +3387,10 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int errors_wait = 0; blk_status_t ret; + lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ head = &info->fs_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; if (!dev->bdev) @@ -3411,7 +3403,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) } /* wait for all the barriers */ - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (dev->missing) continue; if (!dev->bdev) { @@ -3510,7 +3502,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } } - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; @@ -3551,7 +3543,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) } total_errors = 0; - list_for_each_entry_rcu(dev, head, dev_list) { + list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; if (!dev->in_fs_metadata || !dev->writeable) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 7f7c35d6347a..301151a50ac1 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -149,6 +149,9 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); +struct extent_map *btree_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, u64 start, u64 len, + int create); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 3aeb5770f896..ddaccad469f8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -283,11 +283,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, name_len = btrfs_inode_ref_name_len(leaf, iref); } - ret = btrfs_is_name_len_valid(leaf, path->slots[0], name_ptr, name_len); - if (!ret) { - btrfs_free_path(path); - return -EIO; - } read_extent_buffer(leaf, name, name_ptr, name_len); btrfs_free_path(path); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4497f937e8fb..fa78da608ff2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2894,7 +2894,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; @@ -4945,12 +4945,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, bytes = 0; else bytes -= delayed_rsv->size; + spin_unlock(&delayed_rsv->lock); + if (percpu_counter_compare(&space_info->total_bytes_pinned, bytes) < 0) { - spin_unlock(&delayed_rsv->lock); return -ENOSPC; } - spin_unlock(&delayed_rsv->lock); commit: trans = btrfs_join_transaction(fs_info->extent_root); @@ -5738,8 +5738,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, * or return if we already have enough space. This will also handle the resreve * tracepoint for the reserved amount. */ -int btrfs_inode_rsv_refill(struct btrfs_inode *inode, - enum btrfs_reserve_flush_enum flush) +static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, + enum btrfs_reserve_flush_enum flush) { struct btrfs_root *root = inode->root; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 012d63870b99..ef72efef8b39 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -21,6 +21,7 @@ #include "locking.h" #include "rcu-string.h" #include "backref.h" +#include "disk-io.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -109,8 +110,6 @@ struct tree_entry { struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; - get_extent_t *get_extent; - /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -581,7 +580,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask, struct extent_changeset *changeset) @@ -1295,10 +1294,10 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask) + struct extent_state **cached) { return __clear_extent_bit(tree, start, end, bits, wake, delete, - cached, mask, NULL); + cached, GFP_NOFS, NULL); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1348,7 +1347,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); + EXTENT_LOCKED, 1, 0, NULL); return 0; } return 1; @@ -1744,7 +1743,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, unsigned long page_ops) { clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, - NULL, GFP_NOFS); + NULL); __process_pages_contig(inode->i_mapping, locked_page, start >> PAGE_SHIFT, end >> PAGE_SHIFT, @@ -3092,9 +3091,8 @@ out: static inline void __do_contiguous_readpages(struct extent_io_tree *tree, struct page *pages[], int nr_pages, u64 start, u64 end, - get_extent_t *get_extent, struct extent_map **em_cached, - struct bio **bio, int mirror_num, + struct bio **bio, unsigned long *bio_flags, u64 *prev_em_start) { @@ -3115,18 +3113,17 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, } for (index = 0; index < nr_pages; index++) { - __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, 0, prev_em_start); + __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, + bio, 0, bio_flags, 0, prev_em_start); put_page(pages[index]); } } static void __extent_readpages(struct extent_io_tree *tree, struct page *pages[], - int nr_pages, get_extent_t *get_extent, + int nr_pages, struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, + struct bio **bio, unsigned long *bio_flags, u64 *prev_em_start) { u64 start = 0; @@ -3146,8 +3143,8 @@ static void __extent_readpages(struct extent_io_tree *tree, } else { __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, - end, get_extent, em_cached, - bio, mirror_num, bio_flags, + end, em_cached, + bio, bio_flags, prev_em_start); start = page_start; end = start + PAGE_SIZE - 1; @@ -3158,9 +3155,8 @@ static void __extent_readpages(struct extent_io_tree *tree, if (end) __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, - end, get_extent, em_cached, bio, - mirror_num, bio_flags, - prev_em_start); + end, em_cached, bio, + bio_flags, prev_em_start); } static int __extent_read_full_page(struct extent_io_tree *tree, @@ -3375,7 +3371,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, NULL, 1); break; } - em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, end - cur + 1, 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); @@ -4060,14 +4056,12 @@ static noinline void flush_write_bio(void *data) } int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, struct writeback_control *wbc) { int ret; struct extent_page_data epd = { .bio = NULL, .tree = tree, - .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4079,8 +4073,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, } int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, - int mode) + u64 start, u64 end, int mode) { int ret = 0; struct address_space *mapping = inode->i_mapping; @@ -4091,7 +4084,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, struct extent_page_data epd = { .bio = NULL, .tree = tree, - .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4123,14 +4115,12 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, - get_extent_t *get_extent, struct writeback_control *wbc) { int ret = 0; struct extent_page_data epd = { .bio = NULL, .tree = tree, - .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4143,8 +4133,7 @@ int extent_writepages(struct extent_io_tree *tree, int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent) + struct list_head *pages, unsigned nr_pages) { struct bio *bio = NULL; unsigned page_idx; @@ -4170,13 +4159,13 @@ int extent_readpages(struct extent_io_tree *tree, pagepool[nr++] = page; if (nr < ARRAY_SIZE(pagepool)) continue; - __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, &prev_em_start); + __extent_readpages(tree, pagepool, nr, &em_cached, &bio, + &bio_flags, &prev_em_start); nr = 0; } if (nr) - __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, &prev_em_start); + __extent_readpages(tree, pagepool, nr, &em_cached, &bio, + &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); @@ -4209,7 +4198,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, - 1, 1, &cached_state, GFP_NOFS); + 1, 1, &cached_state); return 0; } @@ -4234,9 +4223,9 @@ static int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - ret = clear_extent_bit(tree, start, end, + ret = __clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), - 0, 0, NULL, mask); + 0, 0, NULL, mask, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -4302,9 +4291,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, * This maps until we find something past 'last' */ static struct extent_map *get_extent_skip_holes(struct inode *inode, - u64 offset, - u64 last, - get_extent_t *get_extent) + u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; @@ -4318,15 +4305,14 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); + em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset, + len, 0); if (IS_ERR_OR_NULL(em)) return em; /* if this isn't a hole return it */ - if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && - em->block_start != EXTENT_MAP_HOLE) { + if (em->block_start != EXTENT_MAP_HOLE) return em; - } /* this is a hole, advance to the next extent */ offset = extent_map_end(em); @@ -4451,7 +4437,7 @@ static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, } int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent) + __u64 start, __u64 len) { int ret = 0; u64 off = start; @@ -4533,8 +4519,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); - em = get_extent_skip_holes(inode, start, last_for_get_extent, - get_extent); + em = get_extent_skip_holes(inode, start, last_for_get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -4622,8 +4607,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent, - get_extent); + em = get_extent_skip_holes(inode, off, last_for_get_extent); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -5263,8 +5247,7 @@ int extent_buffer_uptodate(struct extent_buffer *eb) } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, int wait, - get_extent_t *get_extent, int mirror_num) + struct extent_buffer *eb, int wait, int mirror_num) { unsigned long i; struct page *page; @@ -5324,7 +5307,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, - get_extent, &bio, + btree_get_extent, &bio, mirror_num, &bio_flags, REQ_META); if (err) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 93dcae0c3183..c28f5ef88f42 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -300,19 +300,22 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, - struct extent_state **cached, gfp_t mask); + struct extent_state **cached); +int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask, + struct extent_changeset *changeset); static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL); } static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, @@ -323,8 +326,7 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, - GFP_NOFS); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -340,10 +342,10 @@ static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); + return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, GFP_NOFS, NULL); } static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -358,7 +360,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + EXTENT_DO_ACCOUNTING, 0, 0, NULL); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -402,23 +404,19 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); int extent_write_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, struct writeback_control *wbc); int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, - u64 start, u64 end, get_extent_t *get_extent, - int mode); + u64 start, u64 end, int mode); int extent_writepages(struct extent_io_tree *tree, struct address_space *mapping, - get_extent_t *get_extent, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); int extent_readpages(struct extent_io_tree *tree, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, - get_extent_t get_extent); + struct list_head *pages, unsigned nr_pages); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, get_extent_t *get_extent); + __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, @@ -437,7 +435,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); #define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, int wait, - get_extent_t *get_extent, int mirror_num); + int mirror_num); void wait_on_extent_buffer_writeback(struct extent_buffer *eb); static inline unsigned long num_extent_pages(u64 start, u64 len) diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 64365bbc9b16..e9e285d45c7e 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -13,7 +13,6 @@ /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ #define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ #define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index eb1bac7c8553..1589961f7bba 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1519,7 +1519,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, clear_extent_bit(&inode->io_tree, start_pos, last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + 0, 0, cached_state); *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -1755,7 +1755,7 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, - pos, copied, NULL); + pos, copied, &cached_state); if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, @@ -2019,10 +2019,19 @@ int btrfs_release_file(struct inode *inode, struct file *filp) static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) { int ret; + struct blk_plug plug; + /* + * This is only called in fsync, which would do synchronous writes, so + * a plug can merge adjacent IOs as much as possible. Esp. in case of + * multiple disks using raid profile, a large IO can be split to + * several segments of stripe length (currently 64K). + */ + blk_start_plug(&plug); atomic_inc(&BTRFS_I(inode)->sync_writers); ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); + blk_finish_plug(&plug); return ret; } @@ -2450,6 +2459,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) return ret; } +static int btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) +{ + while (1) { + struct btrfs_ordered_extent *ordered; + int ret; + + truncate_pagecache_range(inode, lockstart, lockend); + + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); + ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + + /* + * We need to make sure we have no ordered extents in this range + * and nobody raced in and read a page in this range, if we did + * we need to try again. + */ + if ((!ordered || + (ordered->file_offset + ordered->len <= lockstart || + ordered->file_offset > lockend)) && + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, cached_state, GFP_NOFS); + ret = btrfs_wait_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (ret) + return ret; + } + return 0; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2566,38 +2615,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - while (1) { - struct btrfs_ordered_extent *ordered; - - truncate_pagecache_range(inode, lockstart, lockend); - - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); - - /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. - */ - if ((!ordered || - (ordered->file_offset + ordered->len <= lockstart || - ordered->file_offset > lockend)) && - !btrfs_page_exists_in_range(inode, lockstart, lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state, GFP_NOFS); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) { - inode_unlock(inode); - return ret; - } + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) { + inode_unlock(inode); + goto out_only_mutex; } path = btrfs_alloc_path(); @@ -2806,6 +2828,217 @@ insert: return 0; } +static int btrfs_fallocate_update_isize(struct inode *inode, + const u64 end, + const int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + int ret2; + + if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) + return 0; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->i_ctime = current_time(inode); + i_size_write(inode, end); + btrfs_ordered_update_i_size(inode, end, NULL); + ret = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_end_transaction(trans); + + return ret ? ret : ret2; +} + +static int btrfs_zero_range_check_range_boundary(struct inode *inode, + u64 offset) +{ + const u64 sectorsize = btrfs_inode_sectorsize(inode); + struct extent_map *em; + int ret = 0; + + offset = round_down(offset, sectorsize); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start == EXTENT_MAP_HOLE) + ret = 1; + + free_extent_map(em); + return ret; +} + +static int btrfs_zero_range(struct inode *inode, + loff_t offset, + loff_t len, + const int mode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_map *em; + struct extent_changeset *data_reserved = NULL; + int ret; + u64 alloc_hint = 0; + const u64 sectorsize = btrfs_inode_sectorsize(inode); + u64 alloc_start = round_down(offset, sectorsize); + u64 alloc_end = round_up(offset + len, sectorsize); + u64 bytes_to_reserve = 0; + bool space_reserved = false; + + inode_dio_wait(inode); + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, alloc_end - alloc_start, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + /* + * Avoid hole punching and extent allocation for some cases. More cases + * could be considered, but these are unlikely common and we keep things + * as simple as possible for now. Also, intentionally, if the target + * range contains one or more prealloc extents together with regular + * extents and holes, we drop all the existing extents and allocate a + * new prealloc extent, so that we get a larger contiguous disk extent. + */ + if (em->start <= alloc_start && + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + const u64 em_end = em->start + em->len; + + if (em_end >= offset + len) { + /* + * The whole range is already a prealloc extent, + * do nothing except updating the inode's i_size if + * needed. + */ + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + /* + * Part of the range is already a prealloc extent, so operate + * only on the remaining part of the range. + */ + alloc_start = em_end; + ASSERT(IS_ALIGNED(alloc_start, sectorsize)); + len = offset + len - alloc_start; + offset = alloc_start; + alloc_hint = em->block_start + em->len; + } + free_extent_map(em); + + if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == + BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, + alloc_start, sectorsize, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); + ret = btrfs_fallocate_update_isize(inode, offset + len, + mode); + goto out; + } + if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { + free_extent_map(em); + ret = btrfs_truncate_block(inode, offset, len, 0); + if (!ret) + ret = btrfs_fallocate_update_isize(inode, + offset + len, + mode); + return ret; + } + free_extent_map(em); + alloc_start = round_down(offset, sectorsize); + alloc_end = alloc_start + sectorsize; + goto reserve_space; + } + + alloc_start = round_up(offset, sectorsize); + alloc_end = round_down(offset + len, sectorsize); + + /* + * For unaligned ranges, check the pages at the boundaries, they might + * map to an extent, in which case we need to partially zero them, or + * they might map to a hole, in which case we need our allocation range + * to cover them. + */ + if (!IS_ALIGNED(offset, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, offset); + if (ret < 0) + goto out; + if (ret) { + alloc_start = round_down(offset, sectorsize); + ret = 0; + } else { + ret = btrfs_truncate_block(inode, offset, 0, 0); + if (ret) + goto out; + } + } + + if (!IS_ALIGNED(offset + len, sectorsize)) { + ret = btrfs_zero_range_check_range_boundary(inode, + offset + len); + if (ret < 0) + goto out; + if (ret) { + alloc_end = round_up(offset + len, sectorsize); + ret = 0; + } else { + ret = btrfs_truncate_block(inode, offset + len, 0, 1); + if (ret) + goto out; + } + } + +reserve_space: + if (alloc_start < alloc_end) { + struct extent_state *cached_state = NULL; + const u64 lockstart = alloc_start; + const u64 lockend = alloc_end - 1; + + bytes_to_reserve = alloc_end - alloc_start; + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + bytes_to_reserve); + if (ret < 0) + goto out; + space_reserved = true; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + alloc_start, bytes_to_reserve); + if (ret) + goto out; + ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); + if (ret) + goto out; + ret = btrfs_prealloc_file_range(inode, mode, alloc_start, + alloc_end - alloc_start, + i_blocksize(inode), + offset + len, &alloc_hint); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state, GFP_KERNEL); + /* btrfs_prealloc_file_range releases reserved space on error */ + if (ret) + space_reserved = false; + } + out: + if (ret && space_reserved) + btrfs_free_reserved_data_space(inode, data_reserved, + alloc_start, bytes_to_reserve); + extent_changeset_free(data_reserved); + + return ret; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2831,7 +3064,8 @@ static long btrfs_fallocate(struct file *file, int mode, cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -2842,10 +3076,12 @@ static long btrfs_fallocate(struct file *file, int mode, * * For qgroup space, it will be checked later. */ - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + alloc_end - alloc_start); + if (ret < 0) + return ret; + } inode_lock(inode); @@ -2887,6 +3123,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = btrfs_zero_range(inode, offset, len, mode); + inode_unlock(inode); + return ret; + } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -2896,8 +3138,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); + ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + if (ordered && ordered->file_offset + ordered->len > alloc_start && ordered->file_offset < alloc_end) { @@ -2922,7 +3164,7 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - while (1) { + while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR(em)) { @@ -2958,8 +3200,6 @@ static long btrfs_fallocate(struct file *file, int mode, } free_extent_map(em); cur_offset = last_byte; - if (cur_offset >= alloc_end) - break; } /* @@ -2982,37 +3222,18 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) goto out_unlock; - if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = current_time(inode); - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans); - else - ret = btrfs_end_transaction(trans); - } - } + /* + * We didn't need to allocate any more space, but we still extended the + * size of the file so we need to update i_size and the inode item. + */ + ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_KERNEL); out: inode_unlock(inode); /* Let go of our reservation. */ - if (ret != 0) + if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, alloc_start, alloc_end - cur_offset); extent_changeset_free(data_reserved); @@ -3145,7 +3366,7 @@ void btrfs_auto_defrag_exit(void) kmem_cache_destroy(btrfs_inode_defrag_cachep); } -int btrfs_auto_defrag_init(void) +int __init btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4426d1c73e50..b8ab90c9a9fb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -993,8 +993,7 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); goto fail; } leaf = path->nodes[0]; @@ -1008,7 +1007,7 @@ update_cache_item(struct btrfs_trans_handle *trans, clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, - NULL, GFP_NOFS); + NULL); btrfs_release_path(path); goto fail; } @@ -1105,8 +1104,7 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, - GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 993061f83067..57785eadb95c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -769,7 +769,6 @@ retry: inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - btrfs_get_extent, WB_SYNC_ALL); else if (ret) unlock_page(async_cow->locked_page); @@ -1203,7 +1202,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, - 1, 0, NULL, GFP_NOFS); + 1, 0, NULL); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ @@ -1951,7 +1950,21 @@ static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio, /* * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read + * on write, or reading the csums from the tree before a read. + * + * Rules about async/sync submit, + * a) read: sync submit + * + * b) write without checksum: sync submit + * + * c) write with checksum: + * c-1) if bio is issued by fsync: sync submit + * (sync_writers != 0) + * + * c-2) if root is reloc root: sync submit + * (only in case of buffered IO) + * + * c-3) otherwise: async submit */ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, @@ -2023,10 +2036,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { - trans->adding_csums = 1; + trans->adding_csums = true; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); - trans->adding_csums = 0; + trans->adding_csums = false; } return 0; } @@ -2986,7 +2999,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) clear_extent_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + EXTENT_DEFRAG, 0, 0, &cached_state); } if (nolock) @@ -3005,6 +3018,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, + ordered_extent->len); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + @@ -3054,7 +3069,7 @@ out: ordered_extent->len - 1, clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, &cached_state, GFP_NOFS); + 0, &cached_state); } if (trans) @@ -3068,7 +3083,7 @@ out: else start = ordered_extent->file_offset; end = ordered_extent->file_offset + ordered_extent->len - 1; - clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + clear_extent_uptodate(io_tree, start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); @@ -4796,7 +4811,7 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); @@ -5232,8 +5247,7 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); + EXTENT_DEFRAG, 1, 1, &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5892,7 +5906,6 @@ static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_private *private = file->private_data; struct btrfs_dir_item *di; @@ -5960,9 +5973,6 @@ again: if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - if (verify_dir_item(fs_info, leaf, slot, di)) - goto next; - name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { @@ -6558,7 +6568,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, out_unlock: btrfs_end_transaction(trans); - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); if (drop_inode) { inode_dec_link_count(inode); @@ -6639,7 +6648,6 @@ out_unlock: inode_dec_link_count(inode); iput(inode); } - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return err; @@ -6714,7 +6722,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } - btrfs_balance_delayed_items(fs_info); fail: if (trans) btrfs_end_transaction(trans); @@ -6792,7 +6799,6 @@ out_fail: inode_dec_link_count(inode); iput(inode); } - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return err; @@ -6940,7 +6946,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct btrfs_trans_handle *trans = NULL; const bool new_inline = !page || create; -again: read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) @@ -7081,7 +7086,7 @@ next: em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; - if (create == 0 && !PageUptodate(page)) { + if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, @@ -7102,25 +7107,6 @@ next: kunmap(page); } flush_dcache_page(page); - } else if (create && PageUptodate(page)) { - BUG(); - if (!trans) { - kunmap(page); - free_extent_map(em); - em = NULL; - - btrfs_release_path(path); - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) - return ERR_CAST(trans); - goto again; - } - map = kmap(page); - write_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); - kunmap(page); - btrfs_mark_buffer_dirty(leaf); } set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); @@ -7132,7 +7118,6 @@ not_found: em->len = len; not_found_em: em->block_start = EXTENT_MAP_HOLE; - set_bit(EXTENT_FLAG_VACANCY, &em->flags); insert: btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { @@ -7924,7 +7909,7 @@ unlock: if (lockstart < lockend) { clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, - &cached_state, GFP_NOFS); + &cached_state); } else { free_extent_state(cached_state); } @@ -7935,7 +7920,7 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, 1, 0, &cached_state, GFP_NOFS); + unlock_bits, 1, 0, &cached_state); err: if (dio_data) current->journal_info = dio_data; @@ -8458,6 +8443,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, bool write = bio_op(bio) == REQ_OP_WRITE; blk_status_t ret; + /* Check btrfs_submit_bio_hook() for rules about async submit. */ if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8852,7 +8838,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); + return extent_fiemap(inode, fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) @@ -8884,7 +8870,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) return AOP_WRITEPAGE_ACTIVATE; } tree = &BTRFS_I(page->mapping->host)->io_tree; - ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(tree, page, wbc); btrfs_add_delayed_iput(inode); return ret; } @@ -8895,7 +8881,7 @@ static int btrfs_writepages(struct address_space *mapping, struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; - return extent_writepages(tree, mapping, btrfs_get_extent, wbc); + return extent_writepages(tree, mapping, wbc); } static int @@ -8904,8 +8890,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping, { struct extent_io_tree *tree; tree = &BTRFS_I(mapping->host)->io_tree; - return extent_readpages(tree, mapping, pages, nr_pages, - btrfs_get_extent); + return extent_readpages(tree, mapping, pages, nr_pages); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { @@ -8976,8 +8961,7 @@ again: EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 0, &cached_state, - GFP_NOFS); + EXTENT_DEFRAG, 1, 0, &cached_state); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -9034,7 +9018,7 @@ again: EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, - &cached_state, GFP_NOFS); + &cached_state); __btrfs_releasepage(page, GFP_NOFS); } @@ -9162,7 +9146,7 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); + 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); @@ -9419,7 +9403,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_inode *ei; struct inode *inode; - ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; @@ -9571,7 +9555,7 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_free_space_cachep); } -int btrfs_init_cachep(void) +int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, @@ -10686,7 +10670,6 @@ out: btrfs_end_transaction(trans); if (ret) iput(inode); - btrfs_balance_delayed_items(fs_info); btrfs_btree_balance_dirty(fs_info); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d748ad1c3620..2f7562b1e349 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1190,7 +1190,7 @@ again: clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - &cached_state, GFP_NOFS); + &cached_state); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); @@ -1760,6 +1760,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, struct btrfs_trans_handle *trans; u64 root_flags; u64 flags; + bool clear_received_uuid = false; int ret = 0; if (!inode_owner_or_capable(inode)) @@ -1809,6 +1810,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, btrfs_set_root_flags(&root->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); spin_unlock(&root->root_item_lock); + clear_received_uuid = true; } else { spin_unlock(&root->root_item_lock); btrfs_warn(fs_info, @@ -1825,6 +1827,24 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_reset; } + if (clear_received_uuid) { + if (!btrfs_is_empty_uuid(root->root_item.received_uuid)) { + ret = btrfs_uuid_tree_rem(trans, fs_info, + root->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + goto out_reset; + } + + memset(root->root_item.received_uuid, 0, + BTRFS_UUID_SIZE); + } + } + ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { @@ -2675,14 +2695,12 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - mutex_lock(&fs_info->volume_mutex); if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); } else { vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - mutex_unlock(&fs_info->volume_mutex); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { @@ -2726,9 +2744,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&fs_info->volume_mutex); ret = btrfs_rm_device(fs_info, vol_args->name, 0); - mutex_unlock(&fs_info->volume_mutex); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); @@ -2753,16 +2769,16 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, if (!fi_args) return -ENOMEM; - mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; - memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); + memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; @@ -2779,7 +2795,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, { struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int ret = 0; char *s_uuid = NULL; @@ -2790,7 +2805,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, if (!btrfs_is_empty_uuid(di_args->uuid)) s_uuid = di_args->uuid; - mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); if (!dev) { @@ -2805,17 +2820,15 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, if (dev->name) { struct rcu_string *name; - rcu_read_lock(); name = rcu_dereference(dev->name); strncpy(di_args->path, name->str, sizeof(di_args->path)); - rcu_read_unlock(); di_args->path[sizeof(di_args->path) - 1] = 0; } else { di_args->path[0] = '\0'; } out: - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ret = -EFAULT; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index f6a05f836629..4f23ff00c8b3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -164,7 +164,6 @@ static int iterate_object_props(struct btrfs_root *root, size_t), void *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *name_buf = NULL; char *value_buf = NULL; @@ -215,12 +214,6 @@ static int iterate_object_props(struct btrfs_root *root, name_ptr = (unsigned long)(di + 1); data_ptr = name_ptr + name_len; - if (verify_dir_item(fs_info, leaf, - path->slots[0], di)) { - ret = -EIO; - goto out; - } - if (name_len <= XATTR_BTRFS_PREFIX_LEN || memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX, name_ptr, @@ -414,7 +407,7 @@ static int prop_compression_apply(struct inode *inode, type = BTRFS_COMPRESS_LZO; else if (!strncmp("zlib", value, 4)) type = BTRFS_COMPRESS_ZLIB; - else if (!strncmp("zstd", value, len)) + else if (!strncmp("zstd", value, 4)) type = BTRFS_COMPRESS_ZSTD; else return -EINVAL; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 168fd03ca3ac..c02670d5aa13 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,6 +47,80 @@ * - check all ioctl parameters */ +/* + * Helpers to access qgroup reservation + * + * Callers should ensure the lock context and type are valid + */ + +static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) +{ + u64 ret = 0; + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + ret += qgroup->rsv.values[i]; + + return ret; +} + +#ifdef CONFIG_BTRFS_DEBUG +static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) +{ + if (type == BTRFS_QGROUP_RSV_DATA) + return "data"; + if (type == BTRFS_QGROUP_RSV_META) + return "meta"; + return NULL; +} +#endif + +static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); + qgroup->rsv.values[type] += num_bytes; +} + +static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) +{ + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); + if (qgroup->rsv.values[type] >= num_bytes) { + qgroup->rsv.values[type] -= num_bytes; + return; + } +#ifdef CONFIG_BTRFS_DEBUG + WARN_RATELIMIT(1, + "qgroup %llu %s reserved space underflow, have %llu to free %llu", + qgroup->qgroupid, qgroup_rsv_type_str(type), + qgroup->rsv.values[type], num_bytes); +#endif + qgroup->rsv.values[type] = 0; +} + +static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); +} + +static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *dest, + struct btrfs_qgroup *src) +{ + int i; + + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) + qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); +} + static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { @@ -991,33 +1065,29 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } -static void report_reserved_underflow(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup, - u64 num_bytes) -{ -#ifdef CONFIG_BTRFS_DEBUG - WARN_ON(qgroup->reserved < num_bytes); - btrfs_debug(fs_info, - "qgroup %llu reserved space underflow, have: %llu, to free: %llu", - qgroup->qgroupid, qgroup->reserved, num_bytes); -#endif - qgroup->reserved = 0; -} /* - * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their reference and - * exclusive counts adjusted. + * The easy accounting, we're updating qgroup relationship whose child qgroup + * only has exclusive extents. + * + * In this case, all exclsuive extents will also be exlusive for parent, so + * excl/rfer just get added/removed. + * + * So is qgroup reservation space, which should also be added/removed to + * parent. + * Or when child tries to release reservation space, parent will underflow its + * reservation (for relationship adding case). * * Caller should hold fs_info->qgroup_lock. */ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, struct ulist *tmp, u64 ref_root, - u64 num_bytes, int sign) + struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *glist; struct ulist_node *unode; struct ulist_iterator uiter; + u64 num_bytes = src->excl; int ret = 0; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -1030,13 +1100,11 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; - if (sign > 0) { - trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); - if (qgroup->reserved < num_bytes) - report_reserved_underflow(fs_info, qgroup, num_bytes); - else - qgroup->reserved -= num_bytes; - } + + if (sign > 0) + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); + else + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup_dirty(fs_info, qgroup); @@ -1056,15 +1124,10 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->rfer_cmpr += sign * num_bytes; WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; - if (sign > 0) { - trace_qgroup_update_reserve(fs_info, qgroup, - -(s64)num_bytes); - if (qgroup->reserved < num_bytes) - report_reserved_underflow(fs_info, qgroup, - num_bytes); - else - qgroup->reserved -= num_bytes; - } + if (sign > 0) + qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); + else + qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); @@ -1107,7 +1170,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, if (qgroup->excl == qgroup->rfer) { ret = 0; err = __qgroup_excl_accounting(fs_info, tmp, dst, - qgroup->excl, sign); + qgroup, sign); if (err < 0) { ret = err; goto out; @@ -2333,17 +2396,18 @@ out: static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && - qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer) + qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && - qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl) + qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; return true; } -static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, + enum btrfs_qgroup_rsv_type type) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2395,7 +2459,7 @@ retry: * Commit the tree and retry, since we may have * deletions which would free up space. */ - if (!retried && qg->reserved > 0) { + if (!retried && qgroup_rsv_total(qg) > 0) { struct btrfs_trans_handle *trans; spin_unlock(&fs_info->qgroup_lock); @@ -2434,8 +2498,8 @@ retry: qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, num_bytes); - qg->reserved += num_bytes; + trace_qgroup_update_reserve(fs_info, qg, num_bytes, type); + qgroup_rsv_add(fs_info, qg, num_bytes, type); } out: @@ -2444,7 +2508,8 @@ out: } void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes) + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2480,11 +2545,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); - if (qg->reserved < num_bytes) - report_reserved_underflow(fs_info, qg, num_bytes); - else - qg->reserved -= num_bytes; + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type); + qgroup_rsv_release(fs_info, qg, num_bytes, type); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(fs_info->qgroup_ulist, @@ -2872,7 +2934,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; - ret = qgroup_reserve(root, to_reserve, true); + ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; @@ -2883,8 +2945,7 @@ cleanup: ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, - unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, - GFP_NOFS); + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); extent_changeset_release(reserved); return ret; } @@ -2936,7 +2997,8 @@ static int qgroup_free_reserved_data(struct inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed); + btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed, + BTRFS_QGROUP_RSV_DATA); ret = freed; out: extent_changeset_release(&changeset); @@ -2968,7 +3030,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, if (free) btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: extent_changeset_release(&changeset); @@ -3025,7 +3087,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, (s64)num_bytes); - ret = qgroup_reserve(root, num_bytes, enforce); + ret = qgroup_reserve(root, num_bytes, enforce, BTRFS_QGROUP_RSV_META); if (ret < 0) return ret; atomic64_add(num_bytes, &root->qgroup_meta_rsv); @@ -3045,7 +3107,8 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) if (reserved == 0) return; trace_qgroup_meta_reserve(root, -(s64)reserved); - btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); + btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved, + BTRFS_QGROUP_RSV_META); } void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) @@ -3060,7 +3123,8 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); atomic64_sub(num_bytes, &root->qgroup_meta_rsv); trace_qgroup_meta_reserve(root, -(s64)num_bytes); - btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); + btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, + BTRFS_QGROUP_RSV_META); } /* @@ -3088,7 +3152,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) } btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } extent_changeset_release(&changeset); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d9984e87cddf..c8c81b923674 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -61,6 +61,26 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +enum btrfs_qgroup_rsv_type { + BTRFS_QGROUP_RSV_DATA = 0, + BTRFS_QGROUP_RSV_META, + BTRFS_QGROUP_RSV_LAST, +}; + +/* + * Represents how many bytes we have reserved for this qgroup. + * + * Each type should have different reservation behavior. + * E.g, data follows its io_tree flag modification, while + * *currently* meta is just reserve-and-clear during transcation. + * + * TODO: Add new type for reservation which can survive transaction commit. + * Currect metadata reservation behavior is not suitable for such case. + */ +struct btrfs_qgroup_rsv { + u64 values[BTRFS_QGROUP_RSV_LAST]; +}; + /* * one struct for each qgroup, organized in fs_info->qgroup_tree. */ @@ -87,7 +107,7 @@ struct btrfs_qgroup { /* * reservation tracking */ - u64 reserved; + struct btrfs_qgroup_rsv rsv; /* * lists @@ -228,12 +248,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, - u64 ref_root, u64 num_bytes); + u64 ref_root, u64 num_bytes, + enum btrfs_qgroup_rsv_type type); static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes, + BTRFS_QGROUP_RSV_DATA); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 34878699d363..171f3cce30e6 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -606,8 +606,7 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, } /* Walk up to the next node that needs to be processed */ -static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, - int *level) +static int walk_up_tree(struct btrfs_path *path, int *level) { int l; @@ -984,7 +983,6 @@ void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; - struct btrfs_root *root; struct extent_buffer *eb; u64 bytenr = 0, num_bytes = 0; int ret, level; @@ -1014,7 +1012,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) &bytenr, &num_bytes); if (ret) break; - ret = walk_up_tree(root, path, &level); + ret = walk_up_tree(path, &level); if (ret < 0) break; if (ret > 0) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 3338407ef0f0..aab0194efe46 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -387,13 +387,6 @@ again: WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); ptr = (unsigned long)(ref + 1); - ret = btrfs_is_name_len_valid(leaf, path->slots[0], ptr, - name_len); - if (!ret) { - err = -EIO; - goto out; - } - WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); *sequence = btrfs_root_ref_sequence(leaf, ref); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 20d3300bd268..f306c608dc28 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1059,12 +1059,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } } - ret = btrfs_is_name_len_valid(eb, path->slots[0], - (unsigned long)(di + 1), name_len + data_len); - if (!ret) { - ret = -EIO; - goto out; - } if (name_len + data_len > buf_len) { buf_len = name_len + data_len; if (is_vmalloc_addr(buf)) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a4dce153645..138ff32c39b9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -67,6 +67,7 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; +static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); @@ -454,7 +455,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_subvolrootid: case Opt_device: /* - * These are parsed by btrfs_parse_early_options + * These are parsed by btrfs_parse_subvol_options + * and btrfs_parse_early_options * and can be happily ignored here. */ break; @@ -531,9 +533,10 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); no_compress = 0; - } else if (strcmp(args[0].from, "zstd") == 0) { + } else if (strncmp(args[0].from, "zstd", 4) == 0) { compress_type = "zstd"; info->compress_type = BTRFS_COMPRESS_ZSTD; + info->compress_level = btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -888,11 +891,63 @@ out: * only when we need to allocate a new super block. */ static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, char **subvol_name, u64 *subvol_objectid, - struct btrfs_fs_devices **fs_devices) + void *holder, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; + int error = 0; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because btrfs_parse_options + * gets called later + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_device: + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, + flags, holder, fs_devices); + kfree(device_name); + if (error) + goto out; + break; + default: + break; + } + } + +out: + kfree(orig); + return error; +} + +/* + * Parse mount options that are related to subvolume id + * + * The value is later passed to mount_subvol() + */ +static int btrfs_parse_subvol_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, u64 *subvol_objectid) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *orig, *p; char *num = NULL; int error = 0; @@ -900,8 +955,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, return 0; /* - * strsep changes the string, duplicate it because parse_options - * gets called twice + * strsep changes the string, duplicate it because + * btrfs_parse_early_options gets called later */ opts = kstrdup(options, GFP_KERNEL); if (!opts) @@ -940,18 +995,6 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, case Opt_subvolrootid: pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); break; - case Opt_device: - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - error = btrfs_scan_one_device(device_name, - flags, holder, fs_devices); - kfree(device_name); - if (error) - goto out; - break; default: break; } @@ -1365,86 +1408,13 @@ static inline int is_subvolume_inode(struct inode *inode) return 0; } -/* - * This will add subvolid=0 to the argument string while removing any subvol= - * and subvolid= arguments to make sure we get the top-level root for path - * walking to the subvol we want. - */ -static char *setup_root_args(char *args) -{ - char *buf, *dst, *sep; - - if (!args) - return kstrdup("subvolid=0", GFP_KERNEL); - - /* The worst case is that we add ",subvolid=0" to the end. */ - buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, - GFP_KERNEL); - if (!buf) - return NULL; - - while (1) { - sep = strchrnul(args, ','); - if (!strstarts(args, "subvol=") && - !strstarts(args, "subvolid=")) { - memcpy(dst, args, sep - args); - dst += sep - args; - *dst++ = ','; - } - if (*sep) - args = sep + 1; - else - break; - } - strcpy(dst, "subvolid=0"); - - return buf; -} - static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, int flags, const char *device_name, - char *data) + char *data, struct vfsmount *mnt) { struct dentry *root; - struct vfsmount *mnt = NULL; - char *newargs; int ret; - newargs = setup_root_args(data); - if (!newargs) { - root = ERR_PTR(-ENOMEM); - goto out; - } - - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & SB_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY, - device_name, newargs); - } else { - mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY, - device_name, newargs); - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - - down_write(&mnt->mnt_sb->s_umount); - ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); - up_write(&mnt->mnt_sb->s_umount); - if (ret < 0) { - root = ERR_PTR(ret); - goto out; - } - } - } - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - if (!subvol_name) { if (!subvol_objectid) { ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), @@ -1500,7 +1470,6 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, out: mntput(mnt); - kfree(newargs); kfree(subvol_name); return root; } @@ -1558,10 +1527,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, /* * Find a superblock for the given device / mount point. * - * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * Note: This is based on mount_bdev from fs/super.c with a few additions * for multiple device setup. Make sure to keep it in sync. */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, +static struct dentry *mount_root(struct file_system_type *fs_type, int flags, const char *device_name, void *data) { struct block_device *bdev = NULL; @@ -1570,27 +1539,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; fmode_t mode = FMODE_READ; - char *subvol_name = NULL; - u64 subvol_objectid = 0; int error = 0; if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, - &subvol_name, &subvol_objectid, &fs_devices); if (error) { - kfree(subvol_name); return ERR_PTR(error); } - if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { - /* mount_subvol() will free subvol_name. */ - return mount_subvol(subvol_name, subvol_objectid, flags, - device_name, data); - } - security_init_mnt_opts(&new_sec_opts); if (data) { error = parse_security_options(data, &new_sec_opts); @@ -1608,7 +1567,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * it for searching for existing supers, so this lets us do that and * then open_ctree will properly initialize everything later. */ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; @@ -1616,8 +1575,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fs_info->fs_devices = fs_devices; - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); security_init_mnt_opts(&fs_info->security_opts); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; @@ -1674,6 +1633,84 @@ error_sec_opts: return ERR_PTR(error); } +/* + * Mount function which is called by VFS layer. + * + * In order to allow mounting a subvolume directly, btrfs uses + * mount_subtree() which needs vfsmount* of device's root (/). + * This means device's root has to be mounted internally in any case. + * + * Operation flow: + * 1. Parse subvol id related options for later use in mount_subvol(). + * + * 2. Mount device's root (/) by calling vfs_kern_mount(). + * + * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the + * first place. In order to avoid calling btrfs_mount() again, we use + * different file_system_type which is not registered to VFS by + * register_filesystem(). As a result, mount_root() is called. + * The return value will be used by mount_subtree() in mount_subvol(). + * + * 3. Call mount_subvol() to get the dentry of subvolume. Since there is + * "btrfs subvolume set-default", mount_subvol() is called always. + */ +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct vfsmount *mnt_root; + struct dentry *root; + fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + int error = 0; + + if (!(flags & SB_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_subvol_options(data, mode, fs_type, + &subvol_name, &subvol_objectid); + if (error) { + kfree(subvol_name); + return ERR_PTR(error); + } + + /* mount device's root (/) */ + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); + if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { + if (flags & SB_RDONLY) { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags & ~SB_RDONLY, + device_name, data); + } else { + mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags | SB_RDONLY, + device_name, data); + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + goto out; + } + + down_write(&mnt_root->mnt_sb->s_umount); + error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); + up_write(&mnt_root->mnt_sb->s_umount); + if (error < 0) { + root = ERR_PTR(error); + mntput(mnt_root); + goto out; + } + } + } + if (IS_ERR(mnt_root)) { + root = ERR_CAST(mnt_root); + goto out; + } + + /* mount_subvol() will free subvol_name and mnt_root */ + root = mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data, mnt_root); + +out: + return root; +} + static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, int new_pool_size, int old_pool_size) { @@ -2174,6 +2211,15 @@ static struct file_system_type btrfs_fs_type = { .kill_sb = btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; + +static struct file_system_type btrfs_root_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = mount_root, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, +}; + MODULE_ALIAS_FS("btrfs"); static int btrfs_control_open(struct inode *inode, struct file *file) @@ -2324,7 +2370,7 @@ static struct miscdevice btrfs_misc = { MODULE_ALIAS_MISCDEV(BTRFS_MINOR); MODULE_ALIAS("devname:btrfs-control"); -static int btrfs_interface_init(void) +static int __init btrfs_interface_init(void) { return misc_register(&btrfs_misc); } @@ -2334,7 +2380,7 @@ static void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void btrfs_print_mod_info(void) +static void __init btrfs_print_mod_info(void) { pr_info("Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a28bba801264..a8bafed931f4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -897,7 +897,7 @@ static int btrfs_init_debugfs(void) return 0; } -int btrfs_init_sysfs(void) +int __init btrfs_init_sysfs(void) { int ret; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 30affb60da51..13420cd19ef0 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -288,10 +288,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { - test_msg("Vacancy flag wasn't set properly\n"); - goto out; - } free_extent_map(em); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); @@ -1001,8 +997,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1070,8 +1065,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1104,8 +1098,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1121,8 +1114,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_UPTODATE, 0, 0, - NULL, GFP_KERNEL); + EXTENT_UPTODATE, 0, 0, NULL); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -1134,7 +1126,6 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) int ret; set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); - set_bit(EXTENT_FLAG_VACANCY, &vacancy_only); set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5a8c2649af2f..6348573e26a7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -495,8 +495,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; - h->use_count++; - WARN_ON(h->use_count > 2); + refcount_inc(&h->use_count); + WARN_ON(refcount_read(&h->use_count) > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; @@ -567,7 +567,7 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; h->root = root; - h->use_count = 1; + refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; h->type = type; @@ -837,8 +837,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int err = 0; int must_run_delayed_refs = 0; - if (trans->use_count > 1) { - trans->use_count--; + if (refcount_read(&trans->use_count) > 1) { + refcount_dec(&trans->use_count); trans->block_rsv = trans->orig_rsv; return 0; } @@ -1016,8 +1016,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * it's safe to do it (through clear_btree_io_tree()). */ err = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); + EXTENT_NEED_WAIT, 0, 0, &cached_state); if (err == -ENOMEM) err = 0; if (!err) @@ -1869,7 +1868,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; DEFINE_WAIT(wait); - WARN_ON(trans->use_count > 1); + WARN_ON(refcount_read(&trans->use_count) > 1); btrfs_abort_transaction(trans, err); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c55e44560103..6beee072b1bd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -58,6 +58,7 @@ struct btrfs_transaction { /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; + int aborted; struct list_head list; struct extent_io_tree dirty_pages; unsigned long start_time; @@ -70,7 +71,6 @@ struct btrfs_transaction { struct list_head dirty_bgs; struct list_head io_bgs; struct list_head dropped_roots; - u64 num_dirty_bgs; /* * we need to make sure block group deletion doesn't race with @@ -79,11 +79,11 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; + unsigned int num_dirty_bgs; /* Protected by spin lock fs_info->unused_bgs_lock. */ struct list_head deleted_bgs; spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; - int aborted; struct btrfs_fs_info *fs_info; }; @@ -111,20 +111,19 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - unsigned long use_count; - unsigned long blocks_reserved; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; + refcount_t use_count; + unsigned int type; short aborted; - short adding_csums; + bool adding_csums; bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; bool sync; bool dirty; - unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ce4ed6ec8f39..66dac0a4b01f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -30,6 +30,7 @@ #include "tree-checker.h" #include "disk-io.h" #include "compression.h" +#include "hash.h" /* * Error message should follow the following format: @@ -223,6 +224,141 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, } /* + * Customized reported for dir_item, only important new info is key->objectid, + * which represents inode number + */ +__printf(4, 5) +static void dir_item_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, &vaf); + va_end(args); +} + +static int check_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_dir_item *di; + u32 item_size = btrfs_item_size_nr(leaf, slot); + u32 cur = 0; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + while (cur < item_size) { + char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)]; + u32 name_len; + u32 data_len; + u32 max_name_len; + u32 total_size; + u32 name_hash; + u8 dir_type; + + /* header itself should not cross item boundary */ + if (cur + sizeof(*di) > item_size) { + dir_item_err(root, leaf, slot, + "dir item header crosses item boundary, have %lu boundary %u", + cur + sizeof(*di), item_size); + return -EUCLEAN; + } + + /* dir type check */ + dir_type = btrfs_dir_type(leaf, di); + if (dir_type >= BTRFS_FT_MAX) { + dir_item_err(root, leaf, slot, + "invalid dir item type, have %u expect [0, %u)", + dir_type, BTRFS_FT_MAX); + return -EUCLEAN; + } + + if (key->type == BTRFS_XATTR_ITEM_KEY && + dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "invalid dir item type for XATTR key, have %u expect %u", + dir_type, BTRFS_FT_XATTR); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR && + key->type != BTRFS_XATTR_ITEM_KEY) { + dir_item_err(root, leaf, slot, + "xattr dir type found for non-XATTR key"); + return -EUCLEAN; + } + if (dir_type == BTRFS_FT_XATTR) + max_name_len = XATTR_NAME_MAX; + else + max_name_len = BTRFS_NAME_LEN; + + /* Name/data length check */ + name_len = btrfs_dir_name_len(leaf, di); + data_len = btrfs_dir_data_len(leaf, di); + if (name_len > max_name_len) { + dir_item_err(root, leaf, slot, + "dir item name len too long, have %u max %u", + name_len, max_name_len); + return -EUCLEAN; + } + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) { + dir_item_err(root, leaf, slot, + "dir item name and data len too long, have %u max %u", + name_len + data_len, + BTRFS_MAX_XATTR_SIZE(root->fs_info)); + return -EUCLEAN; + } + + if (data_len && dir_type != BTRFS_FT_XATTR) { + dir_item_err(root, leaf, slot, + "dir item with invalid data len, have %u expect 0", + data_len); + return -EUCLEAN; + } + + total_size = sizeof(*di) + name_len + data_len; + + /* header and name/data should not cross item boundary */ + if (cur + total_size > item_size) { + dir_item_err(root, leaf, slot, + "dir item data crosses item boundary, have %u boundary %u", + cur + total_size, item_size); + return -EUCLEAN; + } + + /* + * Special check for XATTR/DIR_ITEM, as key->offset is name + * hash, should match its name + */ + if (key->type == BTRFS_DIR_ITEM_KEY || + key->type == BTRFS_XATTR_ITEM_KEY) { + read_extent_buffer(leaf, namebuf, + (unsigned long)(di + 1), name_len); + name_hash = btrfs_name_hash(namebuf, name_len); + if (key->offset != name_hash) { + dir_item_err(root, leaf, slot, + "name hash mismatch with key, have 0x%016x expect 0x%016llx", + name_hash, key->offset); + return -EUCLEAN; + } + } + cur += total_size; + di = (struct btrfs_dir_item *)((void *)di + total_size); + } + return 0; +} + +/* * Common point to switch the item-specific validation. */ static int check_leaf_item(struct btrfs_root *root, @@ -238,6 +374,11 @@ static int check_leaf_item(struct btrfs_root *root, case BTRFS_EXTENT_CSUM_KEY: ret = check_csum_item(root, leaf, key, slot); break; + case BTRFS_DIR_ITEM_KEY: + case BTRFS_DIR_INDEX_KEY: + case BTRFS_XATTR_ITEM_KEY: + ret = check_dir_item(root, leaf, key, slot); + break; } return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7bf9b31561db..ee1aaed1330e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1173,19 +1173,15 @@ next: return 0; } -static int extref_get_fields(struct extent_buffer *eb, int slot, - unsigned long ref_ptr, u32 *namelen, char **name, - u64 *index, u64 *parent_objectid) +static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index, + u64 *parent_objectid) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ref_ptr; *namelen = btrfs_inode_extref_name_len(eb, extref); - if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name, - *namelen)) - return -EIO; - *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1200,19 +1196,14 @@ static int extref_get_fields(struct extent_buffer *eb, int slot, return 0; } -static int ref_get_fields(struct extent_buffer *eb, int slot, - unsigned long ref_ptr, u32 *namelen, char **name, - u64 *index) +static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, + u32 *namelen, char **name, u64 *index) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ref_ptr; *namelen = btrfs_inode_ref_name_len(eb, ref); - if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1), - *namelen)) - return -EIO; - *name = kmalloc(*namelen, GFP_NOFS); if (*name == NULL) return -ENOMEM; @@ -1287,8 +1278,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, slot, ref_ptr, &namelen, - &name, &ref_index, &parent_objectid); + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index, &parent_objectid); /* * parent object can change from one array * item to another. @@ -1300,8 +1291,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, slot, ref_ptr, &namelen, - &name, &ref_index); + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + &ref_index); } if (ret) goto out; @@ -1835,7 +1826,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; @@ -1848,8 +1838,6 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, slot, di)) - return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); if (ret < 0) @@ -2024,11 +2012,6 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(fs_info, eb, slot, di)) { - ret = -EIO; - goto out; - } - name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); if (!name) { @@ -2109,7 +2092,6 @@ static int replay_xattr_deletes(struct btrfs_trans_handle *trans, struct btrfs_path *path, const u64 ino) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key search_key; struct btrfs_path *log_path; int i; @@ -2151,11 +2133,6 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; - ret = verify_dir_item(fs_info, path->nodes[0], i, di); - if (ret) { - ret = -EIO; - goto out; - } name = kmalloc(name_len, GFP_NOFS); if (!name) { ret = -ENOMEM; @@ -4572,12 +4549,6 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, this_len = sizeof(*extref) + this_name_len; } - ret = btrfs_is_name_len_valid(eb, slot, name_ptr, - this_name_len); - if (!ret) { - ret = -EIO; - goto out; - } if (this_name_len > name_len) { char *new_name; @@ -5432,11 +5403,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *parent, const loff_t start, const loff_t end, - int exists_only, + int inode_only, struct btrfs_log_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; - int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; struct dentry *old_parent = NULL; int ret = 0; @@ -5602,7 +5572,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, int ret; ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)), - parent, start, end, 0, ctx); + parent, start, end, LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -5865,6 +5835,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, return 0; return btrfs_log_inode_parent(trans, root, inode, parent, 0, - LLONG_MAX, 1, NULL); + LLONG_MAX, LOG_INODE_EXISTS, NULL); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49810b70afd3..a999848e745f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -145,6 +145,71 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); +/* + * Device locking + * ============== + * + * There are several mutexes that protect manipulation of devices and low-level + * structures like chunks but not block groups, extents or files + * + * uuid_mutex (global lock) + * ------------------------ + * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from + * the SCAN_DEV ioctl registration or from mount either implicitly (the first + * device) or requested by the device= mount option + * + * the mutex can be very coarse and can cover long-running operations + * + * protects: updates to fs_devices counters like missing devices, rw devices, + * seeding, structure cloning, openning/closing devices at mount/umount time + * + * global::fs_devs - add, remove, updates to the global list + * + * does not protect: manipulation of the fs_devices::devices list! + * + * btrfs_device::name - renames (write side), read is RCU + * + * fs_devices::device_list_mutex (per-fs, with RCU) + * ------------------------------------------------ + * protects updates to fs_devices::devices, ie. adding and deleting + * + * simple list traversal with read-only actions can be done with RCU protection + * + * may be used to exclude some operations from running concurrently without any + * modifications to the list (see write_all_supers) + * + * volume_mutex + * ------------ + * coarse lock owned by a mounted filesystem; used to exclude some operations + * that cannot run in parallel and affect the higher-level properties of the + * filesystem like: device add/deleting/resize/replace, or balance + * + * balance_mutex + * ------------- + * protects balance structures (status, state) and context accessed from + * several places (internally, ioctl) + * + * chunk_mutex + * ----------- + * protects chunks, adding or removing during allocation, trim or when a new + * device is added/removed + * + * cleaner_mutex + * ------------- + * a big lock that is held by the cleaner thread and prevents running subvolume + * cleaning together with relocation or delayed iputs + * + * + * Lock nesting + * ============ + * + * uuid_mutex + * volume_mutex + * device_list_mutex + * chunk_mutex + * balance_mutex + */ + DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); struct list_head *btrfs_get_fs_uuids(void) @@ -180,6 +245,13 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) return fs_devs; } +static void free_device(struct btrfs_device *device) +{ + rcu_string_free(device->name); + bio_put(device->flush_bio); + kfree(device); +} + static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; @@ -188,9 +260,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); + free_device(device); } kfree(fs_devices); } @@ -220,6 +290,11 @@ void btrfs_cleanup_fs_uuids(void) } } +/* + * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. + * Returned struct is not linked onto any lists and must be destroyed using + * free_device. + */ static struct btrfs_device *__alloc_device(void) { struct btrfs_device *dev; @@ -578,15 +653,77 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev) } else { fs_devs->num_devices--; list_del(&dev->dev_list); - rcu_string_free(dev->name); - bio_put(dev->flush_bio); - kfree(dev); + free_device(dev); } break; } } } +static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device, fmode_t flags, + void *holder) +{ + struct request_queue *q; + struct block_device *bdev; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 devid; + int ret; + + if (device->bdev) + return -EINVAL; + if (!device->name) + return -EINVAL; + + ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh); + if (ret) + return ret; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = btrfs_stack_device_id(&disk_super->dev_item); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + fs_devices->seeding = 1; + } else { + device->writeable = !bdev_read_only(bdev); + } + + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + fs_devices->open_devices++; + if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, &fs_devices->alloc_list); + } + brelse(bh); + + return 0; + +error_brelse: + brelse(bh); + blkdev_put(bdev, flags); + + return -EINVAL; +} + /* * Add new device to list of registered devices * @@ -632,8 +769,7 @@ static noinline int device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { - bio_put(device->flush_bio); - kfree(device); + free_device(device); return -ENOMEM; } rcu_assign_pointer(device->name, name); @@ -745,8 +881,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); + free_device(device); goto error; } rcu_assign_pointer(device->name, name); @@ -810,9 +945,7 @@ again: } list_del_init(&device->dev_list); fs_devices->num_devices--; - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); + free_device(device); } if (fs_devices->seed) { @@ -825,35 +958,25 @@ again: mutex_unlock(&uuid_mutex); } -static void __free_device(struct work_struct *work) -{ - struct btrfs_device *device; - - device = container_of(work, struct btrfs_device, rcu_work); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); -} - -static void free_device(struct rcu_head *head) +static void free_device_rcu(struct rcu_head *head) { struct btrfs_device *device; device = container_of(head, struct btrfs_device, rcu); - - INIT_WORK(&device->rcu_work, __free_device); - schedule_work(&device->rcu_work); + free_device(device); } static void btrfs_close_bdev(struct btrfs_device *device) { - if (device->bdev && device->writeable) { + if (!device->bdev) + return; + + if (device->writeable) { sync_blockdev(device->bdev); invalidate_bdev(device->bdev); } - if (device->bdev) - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); } static void btrfs_prepare_close_one_device(struct btrfs_device *device) @@ -917,7 +1040,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); } WARN_ON(fs_devices->open_devices); @@ -947,93 +1070,33 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); } - /* - * Wait for rcu kworkers under __btrfs_close_devices - * to finish all blkdev_puts so device is really - * free when umount is done. - */ - rcu_barrier(); return ret; } static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { - struct request_queue *q; - struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; - struct buffer_head *bh; - struct btrfs_super_block *disk_super; - u64 devid; - int seeding = 1; int ret = 0; flags |= FMODE_EXCL; list_for_each_entry(device, head, dev_list) { - if (device->bdev) - continue; - if (!device->name) - continue; - /* Just open everything we can; ignore failures here */ - if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh)) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret) continue; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - if (devid != device->devid) - goto error_brelse; - - if (memcmp(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE)) - goto error_brelse; - - device->generation = btrfs_super_generation(disk_super); if (!latest_dev || device->generation > latest_dev->generation) latest_dev = device; - - if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { - device->writeable = 0; - } else { - device->writeable = !bdev_read_only(bdev); - seeding = 0; - } - - q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) - device->can_discard = 1; - if (!blk_queue_nonrot(q)) - fs_devices->rotating = 1; - - device->bdev = bdev; - device->in_fs_metadata = 0; - device->mode = flags; - - fs_devices->open_devices++; - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - fs_devices->rw_devices++; - list_add(&device->dev_alloc_list, - &fs_devices->alloc_list); - } - brelse(bh); - continue; - -error_brelse: - brelse(bh); - blkdev_put(bdev, flags); - continue; } if (fs_devices->open_devices == 0) { ret = -EINVAL; goto out; } - fs_devices->seeding = seeding; fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; @@ -1662,7 +1725,7 @@ error: * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ -static int btrfs_add_device(struct btrfs_trans_handle *trans, +static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_device *device) { @@ -1859,6 +1922,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; + mutex_lock(&fs_info->volume_mutex); mutex_lock(&uuid_mutex); num_devices = fs_info->fs_devices->num_devices; @@ -1954,7 +2018,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_scratch_superblocks(device->bdev, device->name->str); btrfs_close_bdev(device); - call_rcu(&device->rcu, free_device); + call_rcu(&device->rcu, free_device_rcu); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1973,6 +2037,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, out: mutex_unlock(&uuid_mutex); + mutex_unlock(&fs_info->volume_mutex); return ret; error_undo: @@ -2025,7 +2090,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device); + call_rcu(&srcdev->rcu, free_device_rcu); /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { @@ -2084,7 +2149,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); btrfs_close_bdev(tgtdev); - call_rcu(&tgtdev->rcu, free_device); + call_rcu(&tgtdev->rcu, free_device_rcu); } static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, @@ -2358,20 +2423,15 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); ret = -ENOMEM; - goto error; + goto error_free_device; } rcu_assign_pointer(device->name, name); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); ret = PTR_ERR(trans); - goto error; + goto error_free_device; } q = bdev_get_queue(bdev); @@ -2450,7 +2510,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } } - ret = btrfs_add_device(trans, fs_info, device); + ret = btrfs_add_dev_item(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); goto error_sysfs; @@ -2511,9 +2571,8 @@ error_trans: sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); - rcu_string_free(device->name); - bio_put(device->flush_bio); - kfree(device); +error_free_device: + free_device(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev && !unlocked) { @@ -2579,8 +2638,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { - bio_put(device->flush_bio); - kfree(device); + free_device(device); ret = -ENOMEM; goto error; } @@ -6004,15 +6062,14 @@ static void btrfs_end_bio(struct bio *bio) dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { if (bio_op(bio) == REQ_OP_WRITE) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc(dev, + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); - btrfs_dev_stat_print_on_error(dev); } } } @@ -6273,8 +6330,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * is generated. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() - * on error. Returned struct is not linked onto any lists and can be - * destroyed with kfree() right away. + * on error. Returned struct is not linked onto any lists and must be + * destroyed with free_device. */ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, @@ -6297,8 +6354,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { - bio_put(dev->flush_bio); - kfree(dev); + free_device(dev); return ERR_PTR(ret); } } @@ -7092,10 +7148,24 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) + stats_cnt = atomic_read(&device->dev_stats_ccnt); + if (!device->dev_stats_valid || stats_cnt == 0) continue; - stats_cnt = atomic_read(&device->dev_stats_ccnt); + + /* + * There is a LOAD-LOAD control dependency between the value of + * dev_stats_ccnt and updating the on-disk values which requires + * reading the in-memory counters. Such control dependencies + * require explicit read memory barriers. + * + * This memory barriers pairs with smp_mb__before_atomic in + * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full + * barrier implied by atomic_xchg in + * btrfs_dev_stats_read_and_reset + */ + smp_rmb(); + ret = update_dev_stat_item(trans, fs_info, device); if (!ret) atomic_sub(stats_cnt, &device->dev_stats_ccnt); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ff15208344a7..294c4eb6a272 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -133,7 +133,6 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; - struct work_struct rcu_work; /* readahead state */ spinlock_t reada_lock; @@ -489,15 +488,16 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset); -static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) -{ - return atomic_read(&dev->dev_stats_ccnt); -} - static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { atomic_inc(dev->dev_stat_values + index); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). + */ smp_mb__before_atomic(); atomic_inc(&dev->dev_stats_ccnt); } @@ -514,7 +514,13 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, int ret; ret = atomic_xchg(dev->dev_stat_values + index, 0); - smp_mb__before_atomic(); + /* + * atomic_xchg implies a full memory barriers as per atomic_t.txt: + * - RMW operations that have a return value are fully ordered; + * + * This implicit memory barriers is paired with the smp_rmb in + * btrfs_run_dev_stats + */ atomic_inc(&dev->dev_stats_ccnt); return ret; } @@ -523,6 +529,12 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, int index, unsigned long val) { atomic_set(dev->dev_stat_values + index, val); + /* + * This memory barrier orders stores updating statistics before stores + * updating dev_stats_ccnt. + * + * It pairs with smp_rmb() in btrfs_run_dev_stats(). + */ smp_mb__before_atomic(); atomic_inc(&dev->dev_stats_ccnt); } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 2c7e53f9ff1b..ad298c248da4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -267,7 +267,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key; struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret = 0; @@ -336,11 +335,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(fs_info, leaf, slot, di)) { - ret = -EIO; - goto err; - } - total_size += name_len + 1; /* * We are just looking for how big our buffer needs to diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 17f2dd8fddb8..eb3e39012658 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -26,11 +26,14 @@ #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +// Max supported by the algorithm is 22, but gains for small blocks (128KB) +// are limited, thus we cap at 15. +#define ZSTD_BTRFS_MAX_LEVEL 15 -static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) +static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len, int level) { - ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, - src_len, 0); + ZSTD_parameters params = ZSTD_getParams(level, src_len, 0); + BUG_ON(level > ZSTD_maxCLevel()); if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; @@ -43,6 +46,9 @@ struct workspace { size_t size; char *buf; struct list_head list; + ZSTD_inBuffer in_buf; + ZSTD_outBuffer out_buf; + int level; }; static void zstd_free_workspace(struct list_head *ws) @@ -57,7 +63,8 @@ static void zstd_free_workspace(struct list_head *ws) static struct list_head *zstd_alloc_workspace(void) { ZSTD_parameters params = - zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); + zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT, + ZSTD_BTRFS_MAX_LEVEL); struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); @@ -94,14 +101,12 @@ static int zstd_compress_pages(struct list_head *ws, int nr_pages = 0; struct page *in_page = NULL; /* The current page to read */ struct page *out_page = NULL; /* The current page to write to */ - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; const unsigned long nr_dest_pages = *out_pages; unsigned long max_out = nr_dest_pages * PAGE_SIZE; - ZSTD_parameters params = zstd_get_btrfs_parameters(len); + ZSTD_parameters params = zstd_get_btrfs_parameters(len, workspace->level); *out_pages = 0; *total_out = 0; @@ -118,9 +123,9 @@ static int zstd_compress_pages(struct list_head *ws, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - in_buf.src = kmap(in_page); - in_buf.pos = 0; - in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.src = kmap(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ @@ -130,14 +135,15 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); while (1) { size_t ret2; - ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_compressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_compressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -146,22 +152,22 @@ static int zstd_compress_pages(struct list_head *ws, } /* Check to see if we are making it bigger */ - if (tot_in + in_buf.pos > 8192 && - tot_in + in_buf.pos < - tot_out + out_buf.pos) { + if (tot_in + workspace->in_buf.pos > 8192 && + tot_in + workspace->in_buf.pos < + tot_out + workspace->out_buf.pos) { ret = -E2BIG; goto out; } /* We've reached the end of our output range */ - if (out_buf.pos >= max_out) { - tot_out += out_buf.pos; + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } /* Check if we need more output space */ - if (out_buf.pos == out_buf.size) { + if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; kunmap(out_page); @@ -176,19 +182,20 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, + PAGE_SIZE); } /* We've reached the end of the input */ - if (in_buf.pos >= len) { - tot_in += in_buf.pos; + if (workspace->in_buf.pos >= len) { + tot_in += workspace->in_buf.pos; break; } /* Check if we need more input */ - if (in_buf.pos == in_buf.size) { + if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; kunmap(in_page); put_page(in_page); @@ -196,15 +203,15 @@ static int zstd_compress_pages(struct list_head *ws, start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - in_buf.src = kmap(in_page); - in_buf.pos = 0; - in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.src = kmap(in_page); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } } while (1) { size_t ret2; - ret2 = ZSTD_endStream(stream, &out_buf); + ret2 = ZSTD_endStream(stream, &workspace->out_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_endStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -212,11 +219,11 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } if (ret2 == 0) { - tot_out += out_buf.pos; + tot_out += workspace->out_buf.pos; break; } - if (out_buf.pos >= max_out) { - tot_out += out_buf.pos; + if (workspace->out_buf.pos >= max_out) { + tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } @@ -235,9 +242,9 @@ static int zstd_compress_pages(struct list_head *ws, goto out; } pages[nr_pages++] = out_page; - out_buf.dst = kmap(out_page); - out_buf.pos = 0; - out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.pos = 0; + workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } if (tot_out >= tot_in) { @@ -273,8 +280,6 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -284,18 +289,19 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - in_buf.src = kmap(pages_in[page_in_index]); - in_buf.pos = 0; - in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); - out_buf.dst = workspace->buf; - out_buf.pos = 0; - out_buf.size = PAGE_SIZE; + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; while (1) { size_t ret2; - ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -303,38 +309,38 @@ static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } buf_start = total_out; - total_out += out_buf.pos; - out_buf.pos = 0; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; - ret = btrfs_decompress_buf2page(out_buf.dst, buf_start, - total_out, disk_start, orig_bio); + ret = btrfs_decompress_buf2page(workspace->out_buf.dst, + buf_start, total_out, disk_start, orig_bio); if (ret == 0) break; - if (in_buf.pos >= srclen) + if (workspace->in_buf.pos >= srclen) break; /* Check if we've hit the end of a frame */ if (ret2 == 0) break; - if (in_buf.pos == in_buf.size) { + if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap(pages_in[page_in_index++]); if (page_in_index >= total_pages_in) { - in_buf.src = NULL; + workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - in_buf.src = kmap(pages_in[page_in_index]); - in_buf.pos = 0; - in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.pos = 0; + workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } } ret = 0; zero_fill_bio(orig_bio); done: - if (in_buf.src) + if (workspace->in_buf.src) kunmap(pages_in[page_in_index]); return ret; } @@ -348,8 +354,6 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, ZSTD_DStream *stream; int ret = 0; size_t ret2; - ZSTD_inBuffer in_buf = { NULL, 0, 0 }; - ZSTD_outBuffer out_buf = { NULL, 0, 0 }; unsigned long total_out = 0; unsigned long pg_offset = 0; char *kaddr; @@ -364,16 +368,17 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(size_t, destlen, PAGE_SIZE); - in_buf.src = data_in; - in_buf.pos = 0; - in_buf.size = srclen; + workspace->in_buf.src = data_in; + workspace->in_buf.pos = 0; + workspace->in_buf.size = srclen; - out_buf.dst = workspace->buf; - out_buf.pos = 0; - out_buf.size = PAGE_SIZE; + workspace->out_buf.dst = workspace->buf; + workspace->out_buf.pos = 0; + workspace->out_buf.size = PAGE_SIZE; ret2 = 1; - while (pg_offset < destlen && in_buf.pos < in_buf.size) { + while (pg_offset < destlen + && workspace->in_buf.pos < workspace->in_buf.size) { unsigned long buf_start; unsigned long buf_offset; unsigned long bytes; @@ -384,7 +389,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = -EIO; goto finish; } - ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + ret2 = ZSTD_decompressStream(stream, &workspace->out_buf, + &workspace->in_buf); if (ZSTD_isError(ret2)) { pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", ZSTD_getErrorCode(ret2)); @@ -393,8 +399,8 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, } buf_start = total_out; - total_out += out_buf.pos; - out_buf.pos = 0; + total_out += workspace->out_buf.pos; + workspace->out_buf.pos = 0; if (total_out <= start_byte) continue; @@ -405,10 +411,11 @@ static int zstd_decompress(struct list_head *ws, unsigned char *data_in, buf_offset = 0; bytes = min_t(unsigned long, destlen - pg_offset, - out_buf.size - buf_offset); + workspace->out_buf.size - buf_offset); kaddr = kmap_atomic(dest_page); - memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes); + memcpy(kaddr + pg_offset, workspace->out_buf.dst + buf_offset, + bytes); kunmap_atomic(kaddr); pg_offset += bytes; @@ -425,6 +432,10 @@ finish: static void zstd_set_level(struct list_head *ws, unsigned int type) { + struct workspace *workspace = list_entry(ws, struct workspace, list); + unsigned level = (type & 0xF0) >> 4; + + workspace->level = level > 0 ? level : ZSTD_BTRFS_DEFAULT_LEVEL; } const struct btrfs_compress_op btrfs_zstd_compress = { |