diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 479 |
1 files changed, 312 insertions, 167 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6628fca9f4ed..e720d3e6ec20 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -101,7 +101,7 @@ int __init btrfs_end_io_wq_init(void) btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", sizeof(struct btrfs_end_io_wq), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_end_io_wq_cache) return -ENOMEM; @@ -124,7 +124,6 @@ struct async_submit_bio { struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; - int rw; int mirror_num; unsigned long bio_flags; /* @@ -327,8 +326,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, read_extent_buffer(buf, &val, 0, csum_size); btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted %X found %X " - "level %d", + "%s checksum verify failed on %llu wanted %X found %X level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) @@ -403,7 +401,8 @@ out: * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ -static int btrfs_check_super_csum(char *raw_disk_sb) +static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, + char *raw_disk_sb) { struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; @@ -429,7 +428,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb) } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n", + btrfs_err(fs_info, "unsupported checksum algorithm %u", csum_type); ret = 1; } @@ -443,7 +442,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb) */ static int btree_read_extent_buffer_pages(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, u64 parent_transid) + u64 parent_transid) { struct extent_io_tree *io_tree; int failed = 0; @@ -455,8 +454,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, - WAIT_COMPLETE, + ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret) { if (!verify_parent_transid(io_tree, eb, @@ -548,9 +546,10 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, } #define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \ - "root=%llu, slot=%d", reason, \ - btrfs_header_bytenr(eb), root->objectid, slot) + btrfs_crit(root->fs_info, "corrupt %s, %s: block=%llu," \ + " root=%llu, slot=%d", \ + btrfs_header_level(eb) == 0 ? "leaf" : "node",\ + reason, btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) @@ -560,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root, u32 nritems = btrfs_header_nritems(leaf); int slot; - if (nritems == 0) + if (nritems == 0) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(root->fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + /* if leaf is the root, then it's fine */ + if (leaf->start != + btrfs_root_bytenr(&check_root->root_item)) { + CORRUPT("non-root leaf's nritems is 0", + leaf, root, 0); + return -EIO; + } + } return 0; + } /* Check the 0 item */ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != @@ -613,6 +633,42 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +static int check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + struct btrfs_key key, next_key; + int slot; + u64 bytenr; + int ret = 0; + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) { + btrfs_crit(root->fs_info, + "corrupt node: block %llu root %llu nritems %lu", + node->start, root->objectid, nr); + return -EIO; + } + + for (slot = 0; slot < nr - 1; slot++) { + bytenr = btrfs_node_blockptr(node, slot); + btrfs_node_key_to_cpu(node, &key, slot); + btrfs_node_key_to_cpu(node, &next_key, slot + 1); + + if (!bytenr) { + CORRUPT("invalid item slot", node, root, slot); + ret = -EIO; + goto out; + } + + if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { + CORRUPT("bad key order", node, root, slot); + ret = -EIO; + goto out; + } + } +out: + return ret; +} + static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -683,6 +739,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; } + if (found_level > 0 && check_node(root, eb)) + ret = -EIO; + if (!ret) set_extent_buffer_uptodate(eb); err: @@ -727,7 +786,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->error = bio->bi_error; - if (bio->bi_rw & REQ_WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { wq = fs_info->endio_meta_write_workers; func = btrfs_endio_meta_write_helper; @@ -797,7 +856,7 @@ static void run_one_async_start(struct btrfs_work *work) int ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->rw, async->bio, + ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) @@ -830,9 +889,8 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->bio_flags, async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -844,7 +902,7 @@ static void run_one_async_free(struct btrfs_work *work) } int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, @@ -857,7 +915,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return -ENOMEM; async->inode = inode; - async->rw = rw; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -873,7 +930,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw & REQ_SYNC) + if (bio->bi_opf & REQ_SYNC) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -903,9 +960,8 @@ static int btree_csum_one_bio(struct bio *bio) return ret; } -static int __btree_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { /* @@ -915,7 +971,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -925,7 +981,7 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -944,14 +1000,14 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { int async = check_async_write(inode, bio_flags); int ret; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -960,21 +1016,19 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, + inode, bio, mirror_num, 0, bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); @@ -1098,10 +1152,10 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) struct inode *btree_inode = root->fs_info->btree_inode; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, WAIT_NONE, btree_get_extent, 0); + buf, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); } @@ -1114,12 +1168,12 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return 0; set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); - ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, btree_get_extent, mirror_num); if (ret) { free_extent_buffer(buf); @@ -1146,8 +1200,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { - if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr); + if (btrfs_is_testing(root->fs_info)) + return alloc_test_extent_buffer(root->fs_info, bytenr, + root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1171,10 +1226,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; - ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + ret = btree_read_extent_buffer_pages(root, buf, parent_transid); if (ret) { free_extent_buffer(buf); return ERR_PTR(ret); @@ -1232,6 +1287,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { + bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; @@ -1286,14 +1342,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; - if (fs_info) + if (!dummy) extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - if (fs_info) + if (!dummy) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; @@ -1314,15 +1370,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(void) +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root; - root = btrfs_alloc_root(NULL, GFP_KERNEL); + if (!fs_info) + return ERR_PTR(-EINVAL); + + root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, root, NULL, 1); - set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + /* We don't use the stripesize in selftest, set it as sectorsize */ + __setup_root(nodesize, sectorsize, sectorsize, root, fs_info, + BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; @@ -1597,14 +1658,14 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto free_writers; + goto fail; mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, &root->highest_objectid); if (ret) { mutex_unlock(&root->objectid_mutex); - goto free_root_dev; + goto fail; } ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1612,19 +1673,13 @@ int btrfs_init_fs_root(struct btrfs_root *root) mutex_unlock(&root->objectid_mutex); return 0; - -free_root_dev: - free_anon_bdev(root->anon_dev); -free_writers: - btrfs_free_subvolume_writers(root->subv_writers); fail: - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); + /* the caller is responsible to call free_fs_root */ return ret; } -static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; @@ -1803,6 +1858,13 @@ static int cleaner_kthread(void *arg) if (btrfs_need_cleaner_sleep(root)) goto sleep; + /* + * Do not do anything if we might cause open_ctree() to block + * before we have finished mounting the filesystem. + */ + if (!test_bit(BTRFS_FS_OPEN, &root->fs_info->flags)) + goto sleep; + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) goto sleep; @@ -2293,9 +2355,8 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; - fs_info->quota_enabled = 0; - fs_info->pending_quota_state = 0; fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2306,17 +2367,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); + btrfs_alloc_workqueue(fs_info, "worker", + flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "delalloc", + flags, max_active, 2); fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "flush_delalloc", + flags, max_active, 0); fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more @@ -2324,41 +2387,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, * devices */ fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, + btrfs_alloc_workqueue(fs_info, "submit", flags, min_t(u64, fs_devices->num_devices, max_active), 64); fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-meta", flags, + max_active, 4); fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, + max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, + max_active, 4); fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-write", flags, + max_active, 2); fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "freespace-write", flags, + max_active, 0); fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, + max_active, 0); fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "readahead", flags, + max_active, 2); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, + btrfs_alloc_workqueue(fs_info, "extent-refs", flags, min_t(u64, fs_devices->num_devices, max_active), 8); @@ -2469,8 +2539,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info, root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->quota_enabled = 1; - fs_info->pending_quota_state = 1; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); fs_info->quota_root = root; } @@ -2517,7 +2586,6 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; - bool cleaner_mutex_locked = false; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2614,6 +2682,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; @@ -2661,8 +2730,7 @@ int open_ctree(struct super_block *sb, extent_io_tree_init(&fs_info->freed_extents[1], fs_info->btree_inode->i_mapping); fs_info->pinned_extents = &fs_info->freed_extents[0]; - fs_info->do_barriers = 1; - + set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -2713,7 +2781,7 @@ int open_ctree(struct super_block *sb, * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ - if (btrfs_check_super_csum(bh->b_data)) { + if (btrfs_check_super_csum(fs_info, bh->b_data)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); @@ -2797,7 +2865,7 @@ int open_ctree(struct super_block *sb, nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); + stripesize = sectorsize; fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); @@ -2996,13 +3064,6 @@ retry_root_backup: goto fail_sysfs; } - /* - * Hold the cleaner_mutex thread here so that we don't block - * for a long time on btrfs_recover_relocation. cleaner_kthread - * will wait for us to finish mounting the filesystem. - */ - mutex_lock(&fs_info->cleaner_mutex); - cleaner_mutex_locked = true; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -3014,8 +3075,8 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(tree_root, SSD) && - !btrfs_test_opt(tree_root, NOSSD) && + if (!btrfs_test_opt(tree_root->fs_info, SSD) && + !btrfs_test_opt(tree_root->fs_info, NOSSD) && !fs_info->fs_devices->rotating) { btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); @@ -3028,9 +3089,9 @@ retry_root_backup: btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, + btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); @@ -3046,7 +3107,7 @@ retry_root_backup: /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && - !btrfs_test_opt(tree_root, NOLOGREPLAY)) { + !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) { ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3062,8 +3123,10 @@ retry_root_backup: ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto fail_qgroup; - /* We locked cleaner_mutex before creating cleaner_kthread. */ + + mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); @@ -3071,8 +3134,6 @@ retry_root_backup: goto fail_qgroup; } } - mutex_unlock(&fs_info->cleaner_mutex); - cleaner_mutex_locked = false; location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; @@ -3087,7 +3148,7 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; - if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); @@ -3124,7 +3185,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); @@ -3145,7 +3206,7 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { btrfs_info(fs_info, "checking UUID tree"); @@ -3157,10 +3218,9 @@ retry_root_backup: return ret; } } else { - fs_info->update_uuid_tree_gen = 1; + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); } - - fs_info->open = 1; + set_bit(BTRFS_FS_OPEN, &fs_info->flags); /* * backuproot only affect mount behavior, and if open_ctree succeeded, @@ -3186,10 +3246,6 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: - if (cleaner_mutex_locked) { - mutex_unlock(&fs_info->cleaner_mutex); - cleaner_mutex_locked = false; - } btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: @@ -3226,7 +3282,7 @@ fail: return err; recovery_tree_root: - if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) + if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, 0); @@ -3420,9 +3476,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); else - ret = btrfsic_submit_bh(WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); if (ret) errors++; } @@ -3486,12 +3542,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(bio); return 0; } @@ -3568,7 +3625,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) } if (min_tolerated == INT_MAX) { - pr_warn("BTRFS: unknown raid flag: %llu\n", flags); + pr_warn("BTRFS: unknown raid flag: %llu", flags); min_tolerated = 0; } @@ -3641,7 +3698,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - do_barriers = !btrfs_test_opt(root, NOBARRIER); + do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER); backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; @@ -3739,8 +3796,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); + if (root->reloc_root) { + free_extent_buffer(root->reloc_root->node); + free_extent_buffer(root->reloc_root->commit_root); + btrfs_put_fs_root(root->reloc_root); + root->reloc_root = NULL; + } + } if (root->free_ino_pinned) __btrfs_remove_free_space_cache(root->free_ino_pinned); @@ -3847,11 +3911,10 @@ void close_ctree(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; int ret; - fs_info->closing = 1; - smp_mb(); + set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); /* wait for the qgroup rescan worker to stop */ - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); @@ -3893,8 +3956,7 @@ void close_ctree(struct btrfs_root *root) kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); - fs_info->closing = 2; - smp_mb(); + set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); @@ -3919,13 +3981,13 @@ void close_ctree(struct btrfs_root *root) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); - fs_info->open = 0; + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, 1); iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) + if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY)) btrfsic_unmount(root, fs_info->fs_devices); #endif @@ -3990,8 +4052,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) root = BTRFS_I(buf->pages[0]->mapping->host)->root; btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) - WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " - "found %llu running %llu\n", + WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, root->fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) @@ -4042,7 +4103,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; - return btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + return btree_read_extent_buffer_pages(root, buf, parent_transid); } static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, @@ -4054,24 +4115,24 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int ret = 0; if (btrfs_super_magic(sb) != BTRFS_MAGIC) { - printk(KERN_ERR "BTRFS: no valid FS found\n"); + btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) - printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", + btrfs_warn(fs_info, "unrecognized super flag: %llu", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", + btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n", + btrfs_err(fs_info, "chunk_root level too big: %d >= %d", btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n", + btrfs_err(fs_info, "log_root level too big: %d >= %d", btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } @@ -4082,47 +4143,48 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, */ if (!is_power_of_2(sectorsize) || sectorsize < 4096 || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); + btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* Only PAGE SIZE is supported yet */ if (sectorsize != PAGE_SIZE) { - printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", - sectorsize, PAGE_SIZE); + btrfs_err(fs_info, + "sectorsize %llu not supported yet, only support %lu", + sectorsize, PAGE_SIZE); ret = -EINVAL; } if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); + btrfs_err(fs_info, "invalid nodesize %llu", nodesize); ret = -EINVAL; } if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { - printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", - le32_to_cpu(sb->__unused_leafsize), - nodesize); + btrfs_err(fs_info, "invalid leafsize %u, should be %llu", + le32_to_cpu(sb->__unused_leafsize), nodesize); ret = -EINVAL; } /* Root alignment check */ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - btrfs_super_root(sb)); + btrfs_warn(fs_info, "tree_root block unaligned: %llu", + btrfs_super_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { - printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", - btrfs_super_chunk_root(sb)); + btrfs_warn(fs_info, "chunk_root block unaligned: %llu", + btrfs_super_chunk_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { - printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", - btrfs_super_log_root(sb)); + btrfs_warn(fs_info, "log_root block unaligned: %llu", + btrfs_super_log_root(sb)); ret = -EINVAL; } if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { - printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", - fs_info->fsid, sb->dev_item.fsid); + btrfs_err(fs_info, + "dev_item UUID does not match fsid: %pU != %pU", + fs_info->fsid, sb->dev_item.fsid); ret = -EINVAL; } @@ -4130,17 +4192,27 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } if (btrfs_super_num_devices(sb) > (1UL << 31)) - printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", - btrfs_super_num_devices(sb)); + btrfs_warn(fs_info, "suspicious number of devices: %llu", + btrfs_super_num_devices(sb)); if (btrfs_super_num_devices(sb) == 0) { - printk(KERN_ERR "BTRFS: number of devices is 0\n"); + btrfs_err(fs_info, "number of devices is 0"); ret = -EINVAL; } if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { - printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", - btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + btrfs_err(fs_info, "super offset mismatch %llu != %u", + btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); ret = -EINVAL; } @@ -4149,17 +4221,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * and one chunk */ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { - printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n", - btrfs_super_sys_array_size(sb), - BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + btrfs_err(fs_info, "system chunk array too big %u > %u", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); ret = -EINVAL; } if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)) { - printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n", - btrfs_super_sys_array_size(sb), - sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)); + btrfs_err(fs_info, "system chunk array too small %u < %zu", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); ret = -EINVAL; } @@ -4168,14 +4240,16 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * but it's still possible that it's the one that's wrong. */ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) - printk(KERN_WARNING - "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", - btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); + btrfs_warn(fs_info, + "suspicious: generation < chunk_root_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_chunk_root_generation(sb)); if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) && btrfs_super_cache_generation(sb) != (u64)-1) - printk(KERN_WARNING - "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", - btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); + btrfs_warn(fs_info, + "suspicious: generation < cache_generation: %llu < %llu", + btrfs_super_generation(sb), + btrfs_super_cache_generation(sb)); return ret; } @@ -4419,9 +4493,80 @@ again: return 0; } +static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) +{ + struct inode *inode; + + inode = cache->io_ctl.inode; + if (inode) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + cache->io_ctl.inode = NULL; + iput(inode); + } + btrfs_put_block_group(cache); +} + +void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + + spin_lock(&cur_trans->dirty_bgs_lock); + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + if (!cache) { + btrfs_err(root->fs_info, + "orphan block group dirty_bgs list"); + spin_unlock(&cur_trans->dirty_bgs_lock); + return; + } + + if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->io_list); + btrfs_cleanup_bg_io(cache); + spin_lock(&cur_trans->dirty_bgs_lock); + } + + list_del_init(&cache->dirty_list); + spin_lock(&cache->lock); + cache->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&cache->lock); + + spin_unlock(&cur_trans->dirty_bgs_lock); + btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + + while (!list_empty(&cur_trans->io_bgs)) { + cache = list_first_entry(&cur_trans->io_bgs, + struct btrfs_block_group_cache, + io_list); + if (!cache) { + btrfs_err(root->fs_info, + "orphan block group on io_bgs list"); + return; + } + + list_del_init(&cache->io_list); + spin_lock(&cache->lock); + cache->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&cache->lock); + btrfs_cleanup_bg_io(cache); + } +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { + btrfs_cleanup_dirty_bgs(cur_trans, root); + ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); + btrfs_destroy_delayed_refs(cur_trans, root); cur_trans->state = TRANS_STATE_COMMIT_START; |