Diffstat (limited to 'fs/btrfs')
48 files changed, 4122 insertions, 1773 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 1ce06c849a86..3e36e4adc4a3 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -42,8 +42,14 @@ struct __btrfs_workqueue { /* Thresholding related variants */ atomic_t pending; - int max_active; - int current_max; + + /* Upper limit of concurrent workers */ + int limit_active; + + /* Current number of concurrent workers */ + int current_active; + + /* Threshold to change current_active */ int thresh; unsigned int count; spinlock_t thres_lock; @@ -88,7 +94,7 @@ BTRFS_WORK_HELPER(scrubnc_helper); BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, +__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -96,26 +102,31 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, if (!ret) return NULL; - ret->max_active = max_active; + ret->limit_active = limit_active; atomic_set(&ret->pending, 0); if (thresh == 0) thresh = DFT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ if (thresh < DFT_THRESHOLD) { - ret->current_max = max_active; + ret->current_active = limit_active; ret->thresh = NO_THRESHOLD; } else { - ret->current_max = 1; + /* + * For threshold-able wq, let its concurrency grow on demand. + * Use minimal max_active at alloc time to reduce resource + * usage. + */ + ret->current_active = 1; ret->thresh = thresh; } if (flags & WQ_HIGHPRI) ret->normal_wq = alloc_workqueue("%s-%s-high", flags, - ret->max_active, - "btrfs", name); + ret->current_active, "btrfs", + name); else ret->normal_wq = alloc_workqueue("%s-%s", flags, - ret->max_active, "btrfs", + ret->current_active, "btrfs", name); if (!ret->normal_wq) { kfree(ret); @@ -134,7 +145,7 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, - int max_active, + int limit_active, int thresh) { struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -143,14 +154,14 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, return NULL; ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, - max_active, thresh); + limit_active, thresh); if (!ret->normal) { kfree(ret); return NULL; } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + ret->high = __btrfs_alloc_workqueue(name, flags, limit_active, thresh); if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); @@ -180,7 +191,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) */ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) { - int new_max_active; + int new_current_active; long pending; int need_change = 0; @@ -197,7 +208,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) wq->count %= (wq->thresh / 4); if (!wq->count) goto out; - new_max_active = wq->current_max; + new_current_active = wq->current_active; /* * pending may be changed later, but it's OK since we really @@ -205,19 +216,19 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) */ pending = atomic_read(&wq->pending); if (pending > wq->thresh) - new_max_active++; + new_current_active++; if (pending < wq->thresh / 2) - new_max_active--; - new_max_active = clamp_val(new_max_active, 1, wq->max_active); - if (new_max_active != wq->current_max) { + new_current_active--; + new_current_active = clamp_val(new_current_active, 1, wq->limit_active); + if (new_current_active != wq->current_active) { need_change = 1; - wq->current_max = new_max_active; + wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); if (need_change) { - workqueue_set_max_active(wq->normal_wq, wq->current_max); + workqueue_set_max_active(wq->normal_wq, wq->current_active); } } @@ -351,13 +362,13 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) kfree(wq); } -void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) { if (!wq) return; - wq->normal->max_active = max; + wq->normal->limit_active = limit_active; if (wq->high) - wq->high->max_active = max; + wq->high->limit_active = limit_active; } void btrfs_set_work_high_priority(struct btrfs_work *work) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index b0b093b6afec..ad4d0647d1a6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -69,7 +69,7 @@ BTRFS_WORK_HELPER_PROTO(scrubparity_helper); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, - int max_active, + int limit_active, int thresh); void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, btrfs_func_t func,
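The thresh_exec_hook() hunks above are the substance of the rename: a thresholded workqueue now starts at current_active = 1 and walks its concurrency up or down with the backlog, never exceeding the user-visible limit_active. A minimal standalone C model of that policy follows; only the grow/shrink conditions and the [1, limit_active] clamp are taken from the hunk, the driver loop and sample numbers are invented for illustration.

    #include <stdio.h>

    /* Illustrative model of the patched thresh_exec_hook() policy:
     * grow current_active while the backlog exceeds thresh, shrink
     * once it falls below thresh / 2, and stay within [1, limit_active].
     */
    static int adjust_active(int current_active, int limit_active,
                             int pending, int thresh)
    {
            if (pending > thresh)
                    current_active++;
            if (pending < thresh / 2)
                    current_active--;
            if (current_active < 1)
                    current_active = 1;
            if (current_active > limit_active)
                    current_active = limit_active;
            return current_active;
    }

    int main(void)
    {
            int active = 1;         /* wq starts with minimal concurrency */
            int pendings[] = { 100, 100, 40, 10, 10 };
            int i;

            for (i = 0; i < 5; i++) {
                    active = adjust_active(active, 4, pendings[i], 64);
                    printf("pending=%d -> current_active=%d\n",
                           pendings[i], active);
            }
            return 0;
    }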
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 802fabb30e15..6dcdb2ec9211 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, return -ENOMEM; ref->root_id = root_id; - if (key) + if (key) { ref->key_for_search = *key; - else + /* + * We can often find data backrefs with an offset that is too + * large (>= LLONG_MAX, maximum allowed file offset) due to + * underflows when subtracting a file's offset with the data + * offset of its corresponding extent data item. This can + * happen for example in the clone ioctl. + * So if we detect such case we set the search key's offset to + * zero to make sure we will find the matching file extent item + * at add_all_parents(), otherwise we will miss it because the + * offset taken from the backref is much larger than the offset + * of the file extent item. This can make us scan a very large + * number of file extent items, but at least it will not make + * us miss any. + * This is an ugly workaround for a behaviour that should have + * never existed, but it does and a fix for the clone ioctl + * would touch a lot of places, cause backwards incompatibility + * and would not fix the problem for extents cloned with older + * kernels. + */ + if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY && + ref->key_for_search.offset >= LLONG_MAX) + ref->key_for_search.offset = 0; + } else { memset(&ref->key_for_search, 0, sizeof(ref->key_for_search)); + } ref->inode_list = NULL; ref->level = level; @@ -339,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } + if (btrfs_test_is_dummy_root(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + goto out; + } + if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); else if (time_seq == (u64)-1) @@ -632,7 +661,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, NULL, + ret = __add_prelim_ref(prefs, 0, NULL, ref->level + 1, ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); @@ -664,11 +693,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_data_ref *ref; ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; - ret = __add_prelim_ref(prefs, ref->root, &key, 0, + ret = __add_prelim_ref(prefs, 0, NULL, 0, ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -1809,7 +1834,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, int found = 0; struct extent_buffer *eb; struct btrfs_inode_extref *extref; - struct extent_buffer *leaf; u32 item_size; u32 cur_offset; unsigned long ptr; @@ -1837,9 +1861,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, slot); - ptr = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); cur_offset = 0; while (cur_offset < item_size) { @@ -1853,7 +1876,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, if (ret) break; - cur_offset += btrfs_inode_extref_name_len(leaf, extref); + cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } btrfs_tree_read_unlock_blocking(eb);
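The u64 wrap-around described in the new __add_prelim_ref() comment is easy to reproduce in plain C. A small sketch follows; the LLONG_MAX bound and the reset-to-zero workaround mirror the hunk above, while the sample offsets are invented.

    #include <stdio.h>
    #include <limits.h>
    #include <stdint.h>

    int main(void)
    {
            /* file offset of the extent minus the data offset inside the
             * extent item; when data_offset > file_offset the unsigned
             * 64-bit math wraps, as the __add_prelim_ref() comment says */
            uint64_t file_offset = 4096;
            uint64_t data_offset = 65536;
            uint64_t key_offset = file_offset - data_offset; /* underflows */

            printf("raw key offset: %llu\n", (unsigned long long)key_offset);

            /* the workaround: an offset >= LLONG_MAX cannot be a valid
             * file offset, so reset it and let add_all_parents() find the
             * matching file extent item anyway */
            if (key_offset >= (uint64_t)LLONG_MAX)
                    key_offset = 0;

            printf("search offset : %llu\n", (unsigned long long)key_offset);
            return 0;
    }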
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 81220b2203c6..0ef5cc13fae2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,8 +44,6 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 -/* DIO is ready to submit */ -#define BTRFS_INODE_DIO_READY 12 /* * The following 3 bits are meant only for the btree inode.
* When any of them is set, it means an error happened while writing an diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ce7dec88f4b8..0340c57bf377 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -343,7 +343,7 @@ static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status); +static void btrfsic_bio_end_io(struct bio *bp); static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate); static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, const struct btrfsic_block *block, @@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; + return -ENOMEM; } list_for_each_entry(device, dev_head, dev_list) { @@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", + btrfs_info_in_rcu(device->dev_root->fs_info, + "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, dev_state->name, dev_bytenr, @@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, sizeof(*block_ctx->pagev)) * num_pages, GFP_NOFS); if (!block_ctx->mem_to_free) - return -1; + return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); for (i = 0; i < num_pages; i++) { @@ -2207,7 +2207,7 @@ continue_loop: goto again; } -static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) +static void btrfsic_bio_end_io(struct bio *bp) { struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; int iodone_w_error; @@ -2215,7 +2215,7 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) /* mutex is not held! 
This is not safe if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bio_error_status) + if (bp->bi_error) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2230,7 +2230,7 @@ static void btrfsic_bio_end_io(struct bio *bp) BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) printk(KERN_INFO "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bio_error_status, + bp->bi_error, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2252,7 +2252,7 @@ static void btrfsic_bio_end_io(struct bio *bp) block = next_block; } while (NULL != block); - bp->bi_end_io(bp, bio_error_status); + bp->bi_end_io(bp); } static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ce62324c78e7..97b049ad0594 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -97,10 +97,7 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - int nr_vecs; - - nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); + return btrfs_bio_alloc(bdev, first_byte >> 9, BIO_MAX_PAGES, gfp_flags); } static int check_compressed_csum(struct inode *inode, @@ -152,7 +149,7 @@ fail: * The compressed pages are freed here, and it must be run * in process context */ -static void end_compressed_bio_read(struct bio *bio, int err) +static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; @@ -160,7 +157,7 @@ static void end_compressed_bio_read(struct bio *bio, int err) unsigned long index; int ret; - if (err) + if (bio->bi_error) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -210,7 +207,7 @@ csum_failed: bio_for_each_segment_all(bvec, cb->orig_bio, i) SetPageChecked(bvec->bv_page); - bio_endio(cb->orig_bio, 0); + bio_endio(cb->orig_bio); } /* finally free the cb struct */ @@ -266,7 +263,7 @@ static noinline void end_compressed_writeback(struct inode *inode, * This also calls the writeback end hooks for the file pages so that * metadata and checksums can be updated in the file. */ -static void end_compressed_bio_write(struct bio *bio, int err) +static void end_compressed_bio_write(struct bio *bio) { struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; @@ -274,7 +271,7 @@ static void end_compressed_bio_write(struct bio *bio, int err) struct page *page; unsigned long index; - if (err) + if (bio->bi_error) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -293,7 +290,7 @@ static void end_compressed_bio_write(struct bio *bio, int err) cb->start, cb->start + cb->len - 1, NULL, - err ? 0 : 1); + bio->bi_error ?
0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -697,8 +694,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - if (ret) - bio_endio(comp_bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(comp_bio); + } bio_put(comp_bio); @@ -724,8 +723,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); - if (ret) - bio_endio(comp_bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(comp_bio); + } bio_put(comp_bio); return 0; @@ -744,11 +745,13 @@ out: return ret; } -static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; -static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; -static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; -static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; -static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; +static struct { + struct list_head idle_ws; + spinlock_t ws_lock; + int num_ws; + atomic_t alloc_ws; + wait_queue_head_t ws_wait; +} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -760,10 +763,10 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&comp_idle_workspace[i]); - spin_lock_init(&comp_workspace_lock[i]); - atomic_set(&comp_alloc_workspace[i], 0); - init_waitqueue_head(&comp_workspace_wait[i]); + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); + spin_lock_init(&btrfs_comp_ws[i].ws_lock); + atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); } } @@ -777,38 +780,38 @@ static struct list_head *find_workspace(int type) int cpus = num_online_cpus(); int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; again: - spin_lock(workspace_lock); - if (!list_empty(idle_workspace)) { - workspace = idle_workspace->next; + spin_lock(ws_lock); + if (!list_empty(idle_ws)) { + workspace = idle_ws->next; list_del(workspace); - (*num_workspace)--; - spin_unlock(workspace_lock); + (*num_ws)--; + spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_workspace) > cpus) { + if (atomic_read(alloc_ws) > cpus) { DEFINE_WAIT(wait); - spin_unlock(workspace_lock); - prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + spin_unlock(ws_lock); + prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_ws) > cpus && !*num_ws) schedule(); - finish_wait(workspace_wait, &wait); + finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_workspace); - spin_unlock(workspace_lock); + atomic_inc(alloc_ws); + spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_workspace); - wake_up(workspace_wait); + atomic_dec(alloc_ws); + wake_up(ws_wait); } return workspace; } @@ -820,27 
+823,30 @@ again: static void free_workspace(int type, struct list_head *workspace) { int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; - - spin_lock(workspace_lock); - if (*num_workspace < num_online_cpus()) { - list_add(workspace, idle_workspace); - (*num_workspace)++; - spin_unlock(workspace_lock); + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; + + spin_lock(ws_lock); + if (*num_ws < num_online_cpus()) { + list_add(workspace, idle_ws); + (*num_ws)++; + spin_unlock(ws_lock); goto wake; } - spin_unlock(workspace_lock); + spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_workspace); + atomic_dec(alloc_ws); wake: + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); - if (waitqueue_active(workspace_wait)) - wake_up(workspace_wait); + if (waitqueue_active(ws_wait)) + wake_up(ws_wait); } /* @@ -852,11 +858,11 @@ static void free_workspaces(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&comp_idle_workspace[i])) { - workspace = comp_idle_workspace[i].next; + while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { + workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&comp_alloc_workspace[i]); + atomic_dec(&btrfs_comp_ws[i].alloc_ws); } } } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 54114b4887dd..5b8e235c4b6d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); return ret; + } } if (buf == root->node) { @@ -1925,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } @@ -2028,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -4938,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct extent_buffer *leaf; struct btrfs_item *item; - int last_off; - int dsize = 0; + u32 last_off; + u32 dsize = 0; int ret = 0; int wret; int i; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aac314e14188..a2e73f6053a8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -823,8 +823,18 @@ struct btrfs_disk_balance_args { */ __le64 profiles; - /* usage filter */ - __le64 usage; + /* + * usage filter + * 
BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; /* devid filter */ __le64 devid; @@ -846,10 +856,27 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - /* BTRFS_BALANCE_ARGS_LIMIT value */ - __le64 limit; + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum + * and maximum + */ + union { + __le64 limit; + struct { + __le32 limit_min; + __le32 limit_max; + }; + }; - __le64 unused[7]; + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __le64 unused[6]; } __attribute__ ((__packed__)); /* @@ -1154,6 +1181,10 @@ struct btrfs_space_info { delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ @@ -1228,6 +1259,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* We did a full search and couldn't create a cluster */ + bool fragmented; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -1300,7 +1334,7 @@ struct btrfs_block_group_cache { /* for raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; - unsigned int ro:1; + unsigned int ro; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; @@ -1518,12 +1552,6 @@ struct btrfs_fs_info { */ struct mutex ordered_operations_mutex; - /* - * Same as ordered_operations_mutex except this is for ordered extents - * and not the operations. - */ - struct mutex ordered_extent_flush_mutex; - struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; @@ -1949,6 +1977,9 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshoted; + + /* For qgroup metadata space reserve */ + atomic_t qgroup_meta_rsv; }; struct btrfs_ioctl_defrag_range_args { @@ -2151,6 +2182,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) +#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2175,6 +2208,18 @@ struct btrfs_ioctl_defrag_range_args { btrfs_clear_opt(root->fs_info->mount_opt, opt); \ } +#ifdef CONFIG_BTRFS_DEBUG +static inline int +btrfs_should_fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + return (btrfs_test_opt(root, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(root, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Requests for changes that need to be done during transaction commit.
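The unions added to btrfs_disk_balance_args keep the on-disk format unchanged: each new 32-bit min/max pair aliases the same 8 bytes as the old 64-bit field, so older kernels still see a single value. A userspace model of that layout, with native-endian uint32_t/uint64_t standing in for the kernel's __le32/__le64 types (field names follow the hunk; the printed raw value is endian-dependent):

    #include <stdio.h>
    #include <stdint.h>

    /* Model of the reworked 'usage' slot: the single 64-bit value and
     * the new min/max pair occupy the same storage. */
    struct balance_usage {
            union {
                    uint64_t usage;
                    struct {
                            uint32_t usage_min;
                            uint32_t usage_max;
                    };
            };
    };

    int main(void)
    {
            struct balance_usage bu = { .usage = 0 };

            bu.usage_min = 20;
            bu.usage_max = 60;
            /* both views alias the same 8 bytes */
            printf("sizeof = %zu bytes\n", sizeof(bu));
            printf("range 20..60 stored as raw usage = 0x%016llx\n",
                   (unsigned long long)bu.usage);
            return 0;
    }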
* @@ -3385,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins); + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, @@ -3404,7 +3450,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota); + u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, int delalloc); @@ -3417,7 +3463,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int no_quota); + u64 root_objectid, u64 owner, u64 offset); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3437,6 +3483,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); +void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); @@ -3453,8 +3501,11 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -3470,8 +3521,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); @@ -3495,9 +3546,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); -void 
btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); @@ -4008,8 +4059,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -4043,14 +4094,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) #ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) 
\ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) #endif +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + #ifdef CONFIG_BTRFS_ASSERT __cold @@ -4073,6 +4212,7 @@ __cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, @@ -4130,14 +4270,7 @@ do { \ __LINE__, (errno)); \ } while (0) -#define btrfs_std_error(fs_info, errno) \ -do { \ - if ((errno)) \ - __btrfs_std_error((fs_info), __func__, \ - __LINE__, (errno), NULL); \ -} while (0) - -#define btrfs_error(fs_info, errno, fmt, args...) \ +#define btrfs_std_error(fs_info, errno, fmt, args...) 
\ do { \ __btrfs_std_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ @@ -4185,8 +4318,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a2ae42720a6a..e0941fbb913c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); + + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if ((atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && waitqueue_active(&delayed_root->wait)) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ac3e81da6d4e..e06dd75ad13f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } +static bool merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +{ + struct btrfs_delayed_ref_node *next; + bool done = false; + + next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (!done && &next->list != &head->ref_list) { + int mod; + struct btrfs_delayed_ref_node *next2; + + next2 = list_next_entry(next, list); + + if (next == ref) + goto next; + + if (seq && next->seq >= seq) + goto next; + + if (next->type != ref->type) + goto next; + + if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), + btrfs_delayed_node_to_tree_ref(next), + ref->type)) + goto next; + if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || + ref->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(ref), + btrfs_delayed_node_to_data_ref(next))) + goto next; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + swap(ref, next); + done = true; + } + mod = -next->ref_mod; + } + + drop_delayed_ref(trans, delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, head, ref); + done = true; + } else { + /* + * Can't have multiples of the same ref on a tree block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } +next: + next = next2; + } + + return done; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + u64 seq = 0; + + assert_spin_locked(&head->lock); + + if (list_empty(&head->ref_list)) + return; + + /* We don't have too many refs to merge for data. 
*/ + if (head->is_data) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (&ref->list != &head->ref_list) { + if (seq && ref->seq >= seq) + goto next; + + if (merge_ref(trans, delayed_refs, head, ref, seq)) { + if (list_empty(&head->ref_list)) + break; + ref = list_first_entry(&head->ref_list, + struct btrfs_delayed_ref_node, + list); + continue; + } +next: + ref = list_next_entry(ref, list); + } +} + int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, list); /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->no_quota != ref->no_quota || - exist->seq != ref->seq) + if (exist->type != ref->type || exist->seq != ref->seq) goto add_tail; if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, int action, int is_data) + u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, + int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, int count_mod = 1; int must_insert_reserved = 0; + /* If reserved is provided, it must be a data extent. */ + BUG_ON(!is_data && reserved); + /* * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. 
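The heart of the new merge_ref() above is its ref_mod arithmetic: two mergeable refs with the same action add their mods, refs with opposite actions cancel, and a ref whose mod reaches zero is dropped. A toy standalone model of just that rule (the swap merge_ref() performs when the surviving ref would have the smaller mod is omitted for brevity):

    #include <stdio.h>

    struct toy_ref {
            int action;     /* 1 = add, -1 = drop, mirroring ADD/DROP */
            int ref_mod;
    };

    /* Fold 'next' into 'ref'; returns nonzero when 'ref' should be
     * dropped too because its mod reached zero. */
    static int merge(struct toy_ref *ref, struct toy_ref *next)
    {
            int mod;

            if (ref->action == next->action)
                    mod = next->ref_mod;
            else
                    mod = -next->ref_mod;

            ref->ref_mod += mod;
            return ref->ref_mod == 0;
    }

    int main(void)
    {
            struct toy_ref a = { 1, 2 };    /* two pending adds */
            struct toy_ref b = { -1, 2 };   /* two pending drops */

            if (merge(&a, &b))
                    printf("refs cancelled, both dropped\n");
            else
                    printf("merged ref_mod = %d\n", a.ref_mod);
            return 0;
    }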
@@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + head_ref->qgroup_reserved = 0; + head_ref->qgroup_ref_root = 0; /* Record qgroup extent info if provided */ if (qrecord) { + if (ref_root && reserved) { + head_ref->qgroup_ref_root = ref_root; + head_ref->qgroup_reserved = reserved; + } + qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; @@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { + WARN_ON(ref_root && reserved && existing->qgroup_ref_root + && existing->qgroup_reserved); update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly @@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int no_quota) + int action) { struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int no_quota) + u64 offset, int action) { struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 0); + bytenr, num_bytes, 0, 0, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - no_quota); + num_bytes, parent, ref_root, level, action); spin_unlock(&delayed_refs->lock); return 0; @@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; 
struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 1); + bytenr, num_bytes, ref_root, reserved, + action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, no_quota); + action); spin_unlock(&delayed_refs->lock); return 0; } +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *ref_head; + int ret = 0; + + if (!fs_info->quota_enabled || !is_fstree(ref_root)) + return 0; + + delayed_refs = &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); + if (!ref_head) { + ret = -ENOENT; + goto out; + } + WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); + ref_head->qgroup_ref_root = ref_root; + ref_head->qgroup_reserved = num_bytes; +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 13fb5e6090fe..00ed02cbf3e9 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * For qgroup reserved space freeing. + * + * ref_root and reserved will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * run_delayed_refs() time. + */ + u64 qgroup_ref_root; + u64 qgroup_reserved; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. 
must_insert_reserved is @@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 564a7de17d99..1e668fb7dd4c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -183,8 +183,7 @@ no_valid_dev_replace_entry_found: } out: - if (path) - btrfs_free_path(path); + btrfs_free_path(path); return ret; } @@ -328,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - /* - * Here we commit the transaction to make sure commit_total_bytes - * of all the devices are updated. - */ - trans = btrfs_attach_transaction(root); - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - } else if (PTR_ERR(trans) != -ENOENT) { - return PTR_ERR(trans); - } - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, @@ -357,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) return ret; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -376,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); - if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s started\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s started", src_device->missing ? 
"<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -402,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); + ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ @@ -455,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - if (waitqueue_active(&fs_info->replace_wait)) - wake_up(&fs_info->replace_wait); + wake_up(&fs_info->replace_wait); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, @@ -524,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - printk_in_rcu(KERN_ERR - "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + btrfs_err_in_rcu(root->fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -541,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return scrub_ret; } - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s finished", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -587,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info->fs_devices, src_device); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ @@ -810,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data) progress = status_args->status.progress_1000; kfree(status_args); progress = div_u64(progress, 10); - printk_in_rcu(KERN_INFO - "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to %s @%u%%", dev_replace->srcdev->missing ? 
"<missing disk>" : rcu_str_deref(dev_replace->srcdev->name), dev_replace->srcdev->devid, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f556c3732c2c..2d4667594681 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_WARNING - "BTRFS: %s checksum verify failed on %llu wanted %X found %X " - "level %d\n", + btrfs_warn_rl(fs_info, + "%s checksum verify failed on %llu wanted %X found %X " + "level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) @@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_ERR - "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", - eb->fs_info->sb->s_id, eb->start, + btrfs_err_rl(eb->fs_info, + "parent transid verify failed on %llu wanted %llu found %llu", + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " - "%llu %llu\n", - eb->fs_info->sb->s_id, found_start, eb->start); + btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root->fs_info, eb)) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", - eb->fs_info->sb->s_id, eb->start); + btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", + eb->start); ret = -EIO; goto err; } @@ -703,7 +702,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) return -EIO; /* we fixed nothing */ } -static void end_workqueue_bio(struct bio *bio, int err) +static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; @@ -711,7 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = err; + end_io_wq->error = bio->bi_error; if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -802,13 +801,17 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); /* If an error occured we just want to clean up the bio and move on */ if (async->error) { - bio_endio(async->bio, async->error); + async->bio->bi_error = async->error; + bio_endio(async->bio); return; } @@ -908,8 +911,10 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * submission context. 
Just jump into btrfs_map_bio */ ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -960,10 +965,13 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } - if (ret) { + if (ret) + goto out_w_error; + return 0; + out_w_error: - bio_endio(bio, ret); - } + bio->bi_error = ret; + bio_endio(bio); return ret; } @@ -1259,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); + atomic_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1724,6 +1733,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; + bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; return 0; } @@ -1735,16 +1745,15 @@ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct btrfs_end_io_wq *end_io_wq; - int error; end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - error = end_io_wq->error; + bio->bi_error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio(bio, error); + bio_endio(bio); } static int cleaner_kthread(void *arg) @@ -1753,6 +1762,7 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; + set_freezable(); do { again = 0; @@ -2342,8 +2352,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); + btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2358,12 +2367,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); if (IS_ERR(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); kfree(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); return -EIO; @@ -2371,7 +2380,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_error(tree_root->fs_info, ret, + btrfs_std_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2608,7 +2617,6 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); - mutex_init(&fs_info->ordered_extent_flush_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2648,8 +2656,8 @@ int open_ctree(struct super_block *sb, * Read super block and check the signature bytes only */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) { - err = -EINVAL; + if (IS_ERR(bh)) { + err = PTR_ERR(bh); 
goto fail_alloc; } @@ -2842,6 +2850,8 @@ int open_ctree(struct super_block *sb, !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + if (!IS_ERR(chunk_root->node)) + free_extent_buffer(chunk_root->node); chunk_root->node = NULL; goto fail_tree_roots; } @@ -2880,6 +2890,8 @@ retry_root_backup: !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); + if (!IS_ERR(tree_root->node)) + free_extent_buffer(tree_root->node); tree_root->node = NULL; goto recovery_tree_root; } @@ -2928,7 +2940,7 @@ retry_root_backup: goto fail_fsdev_sysfs; } - ret = btrfs_sysfs_add_one(fs_info); + ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_fsdev_sysfs; @@ -2950,8 +2962,9 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_WARNING "BTRFS: " - "too many missing devices, writeable mount is not allowed\n"); + pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + fs_info->fs_devices->missing_devices, + fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; } @@ -3107,7 +3120,7 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -3169,8 +3182,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " - "I/O error on %s\n", + btrfs_warn_rl_in_rcu(device->dev_root->fs_info, + "lost page write due to IO error on %s", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors @@ -3182,6 +3195,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret) +{ + struct buffer_head *bh; + struct btrfs_super_block *super; + u64 bytenr; + + bytenr = btrfs_sb_offset(copy_num); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + return -EINVAL; + + bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + /* + * If we fail to read from the underlying devices, as of now + * the best option we have is to mark it EIO. + */ + if (!bh) + return -EIO; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + btrfs_super_magic(super) != BTRFS_MAGIC) { + brelse(bh); + return -EINVAL; + } + + *bh_ret = bh; + return 0; +} + + struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) { struct buffer_head *bh; @@ -3189,7 +3233,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) struct btrfs_super_block *super; int i; u64 transid = 0; - u64 bytenr; + int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. 
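btrfs_read_dev_one_super(), added just above and exported via disk-io.h later in this diff, factors the per-copy read-and-validate step out of btrfs_read_dev_super() so other callers can fetch one numbered superblock copy. A hedged usage sketch; copy 1 sits at the 64MiB mirror offset returned by btrfs_sb_offset(1):

        struct buffer_head *bh;
        int ret;

        ret = btrfs_read_dev_one_super(bdev, 1, &bh);
        if (ret)
                return ret;     /* -EINVAL: copy past EOF or bad bytenr/magic; -EIO: read failed */

        /* ... inspect (struct btrfs_super_block *)bh->b_data ... */
        brelse(bh);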
@@ -3197,21 +3241,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) + ret = btrfs_read_dev_one_super(bdev, i, &bh); + if (ret) continue; super = (struct btrfs_super_block *)bh->b_data; - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - continue; - } if (!latest || btrfs_super_generation(super) > transid) { brelse(latest); @@ -3221,6 +3255,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) brelse(bh); } } + + if (!latest) + return ERR_PTR(ret); + return latest; } @@ -3289,8 +3327,9 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "BTRFS: couldn't get super " - "buffer head for bytenr %Lu\n", bytenr); + btrfs_err(device->dev_root->fs_info, + "couldn't get super buffer head for bytenr %llu", + bytenr); errors++; continue; } @@ -3324,10 +3363,8 @@ static int write_dev_supers(struct btrfs_device *device, * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ -static void btrfs_end_empty_barrier(struct bio *bio, int err) +static void btrfs_end_empty_barrier(struct bio *bio) { - if (err) - clear_bit(BIO_UPTODATE, &bio->bi_flags); if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3355,8 +3392,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (!bio_flagged(bio, BIO_UPTODATE)) { - ret = -EIO; + if (bio->bi_error) { + ret = bio->bi_error; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); } @@ -3439,6 +3476,35 @@ static int barrier_all_devices(struct btrfs_fs_info *info) return 0; } +int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) +{ + int raid_type; + int min_tolerated = INT_MAX; + + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || + (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) + min_tolerated = min(min_tolerated, + btrfs_raid_array[BTRFS_RAID_SINGLE]. + tolerated_failures); + + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (raid_type == BTRFS_RAID_SINGLE) + continue; + if (!(flags & btrfs_raid_group[raid_type])) + continue; + min_tolerated = min(min_tolerated, + btrfs_raid_array[raid_type]. 
+ tolerated_failures); + } + + if (min_tolerated == INT_MAX) { + pr_warn("BTRFS: unknown raid flag: %llu\n", flags); + min_tolerated = 0; + } + + return min_tolerated; +} + int btrfs_calc_num_tolerated_disk_barrier_failures( struct btrfs_fs_info *fs_info) { @@ -3448,13 +3514,12 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; - int num_types = 4; int i; int c; int num_tolerated_disk_barrier_failures = (int)fs_info->fs_devices->num_devices; - for (i = 0; i < num_types; i++) { + for (i = 0; i < ARRAY_SIZE(types); i++) { struct btrfs_space_info *tmp; sinfo = NULL; @@ -3472,44 +3537,21 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( down_read(&sinfo->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { - if (!list_empty(&sinfo->block_groups[c])) { - u64 flags; - - btrfs_get_block_group_info( - &sinfo->block_groups[c], &space); - if (space.total_bytes == 0 || - space.used_bytes == 0) - continue; - flags = space.flags; - /* - * return - * 0: if dup, single or RAID0 is configured for - * any of metadata, system or data, else - * 1: if RAID5 is configured, or if RAID1 or - * RAID10 is configured and only two mirrors - * are used, else - * 2: if RAID6 is configured, else - * num_mirrors - 1: if RAID1 or RAID10 is - * configured and more than - * 2 mirrors are used. - */ - if (num_tolerated_disk_barrier_failures > 0 && - ((flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID0)) || - ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) - == 0))) - num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1) { - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID10)) { - num_tolerated_disk_barrier_failures = 1; - } else if (flags & - BTRFS_BLOCK_GROUP_RAID6) { - num_tolerated_disk_barrier_failures = 2; - } - } - } + u64 flags; + + if (list_empty(&sinfo->block_groups[c])) + continue; + + btrfs_get_block_group_info(&sinfo->block_groups[c], + &space); + if (space.total_bytes == 0 || space.used_bytes == 0) + continue; + flags = space.flags; + + num_tolerated_disk_barrier_failures = min( + num_tolerated_disk_barrier_failures, + btrfs_get_num_tolerated_disk_barrier_failures( + flags)); } up_read(&sinfo->groups_sem); } @@ -3544,7 +3586,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3584,7 +3626,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3602,7 +3644,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3760,6 +3802,13 @@ void close_ctree(struct btrfs_root *root) cancel_work_sync(&fs_info->async_reclaim_work); if (!(fs_info->sb->s_flags & MS_RDONLY)) { + /* + * If the cleaner thread is stopped and there are + * block groups queued for removal, 
the deletion will be + * skipped when we quit the cleaner thread. + */ + btrfs_delete_unused_bgs(root->fs_info); + ret = btrfs_commit_super(root); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); @@ -3781,7 +3830,7 @@ void close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -4279,25 +4328,6 @@ again: return 0; } -static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); -} - void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4309,7 +4339,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index d4cbfeeeedd4..adeb31830b9c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); @@ -139,6 +141,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); +int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_calc_num_tolerated_disk_barrier_failures( struct btrfs_fs_info *fs_info); int __init btrfs_end_io_wq_init(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 8d052209f473..2513a7f53334 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -112,11 +112,11 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, u32 generation; if (fh_type == FILEID_BTRFS_WITH_PARENT) { - if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + if (fh_len < BTRFS_FID_SIZE_CONNECTABLE) return NULL; root_objectid = fid->root_objectid; } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { - if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + if (fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) return NULL; root_objectid = fid->parent_root_objectid; } else @@ -136,11 +136,11 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, u32 generation; if ((fh_type != FILEID_BTRFS_WITH_PARENT || - fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + fh_len < BTRFS_FID_SIZE_CONNECTABLE) && (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || - fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) && (fh_type != FILEID_BTRFS_WITHOUT_PARENT || - fh_len != 
BTRFS_FID_SIZE_NON_CONNECTABLE)) + fh_len < BTRFS_FID_SIZE_NON_CONNECTABLE)) return NULL; objectid = fid->objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 07204bf601ed..99a8e57da8a1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota); + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -332,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl) kfree(ctl); } +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + root->nodesize : root->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -388,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work) u64 last = 0; u32 nritems; int ret = -ENOMEM; + bool wakeup = true; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -400,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work) last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. 
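The CONFIG_BTRFS_DEBUG helper fragment_free_space() above deliberately shreds a block group's free space to exercise the fragmented-allocation paths added later in this series: with chunk = nodesize (metadata) or sectorsize (data) and step = 2 * chunk, it removes the first chunk of every step-sized stride from the free-space cache, leaving alternating chunk-sized holes. For a metadata group with 16KiB nodes:

        offset:   0        16K      32K      48K      64K  ...
                  removed  kept     removed  kept     removed

Half of the free space is carved away, which is why the accounting hunks that follow add bytes_used >> 1 (half of what was free) to space_info->bytes_used.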
+ */ + if (btrfs_should_fragment_free_space(extent_root, block_group)) + wakeup = false; +#endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -441,7 +471,8 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); @@ -464,7 +495,8 @@ next: key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -491,7 +523,8 @@ next: if (total_found > (1024 * 1024 * 2)) { total_found = 0; - wake_up(&caching_ctl->wait); + if (wakeup) + wake_up(&caching_ctl->wait); } } path->slots[0]++; @@ -501,13 +534,27 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); - caching_ctl->progress = (u64)-1; - spin_lock(&block_group->lock); block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(extent_root, block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(extent_root, block_group); + } +#endif + + caching_ctl->progress = (u64)-1; err: btrfs_free_path(path); up_read(&fs_info->commit_root_sem); @@ -607,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(fs_info->extent_root, + cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(fs_info->extent_root, cache); + } +#endif mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); @@ -1316,8 +1379,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline u32 extent_data_ref_count(struct btrfs_root *root, - struct btrfs_path *path, +static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; @@ -1883,10 +1945,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static int btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) +#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, + u64 *discarded_bytes) { - return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); + int j, ret = 0; + u64 bytes_left, end; + u64 aligned_start = ALIGN(start, 1 << 9); + + if (WARN_ON(start != aligned_start)) { + len -= aligned_start - start; + len = round_down(len, 1 << 9); + start = aligned_start; + } + + *discarded_bytes = 0; + + if (!len) + return 0; + + end = start + len; + bytes_left = len; + + /* Skip any superblocks on 
this device. */ + for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { + u64 sb_start = btrfs_sb_offset(j); + u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; + u64 size = sb_start - start; + + if (!in_range(sb_start, start, bytes_left) && + !in_range(sb_end, start, bytes_left) && + !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) + continue; + + /* + * Superblock spans beginning of range. Adjust start and + * try again. + */ + if (sb_start <= start) { + start += sb_end - start; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + continue; + } + + if (size) { + ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += size; + else if (ret != -EOPNOTSUPP) + return ret; + } + + start = sb_end; + if (start > end) { + bytes_left = 0; + break; + } + bytes_left = end - start; + } + + if (bytes_left) { + ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + GFP_NOFS, 0); + if (!ret) + *discarded_bytes += bytes_left; + } + return ret; } int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, @@ -1907,14 +2036,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < bbio->num_stripes; i++, stripe++) { + u64 bytes; if (!stripe->dev->can_discard) continue; ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, - stripe->length); + stripe->length, + &bytes); if (!ret) - discarded_bytes += stripe->length; + discarded_bytes += bytes; else if (ret != -EOPNOTSUPP) break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ @@ -1941,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, - int no_quota) + u64 root_objectid, u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1954,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + num_bytes, parent, root_objectid, + owner, offset, 0, + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -1980,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 num_bytes = node->num_bytes; u64 refs; int ret; - int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) - no_quota = 1; - path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ @@ -2223,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins, - node->no_quota); + ref->level, &ins); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, @@ -2277,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(root->fs_info, + head->qgroup_ref_root, + head->qgroup_reserved); return ret; } @@ -2365,7 +2495,21 @@ static noinline int 
__btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2760,6 +2904,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2776,6 +2921,7 @@ again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif + trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { btrfs_abort_transaction(trans, root, ret); @@ -2825,6 +2971,7 @@ again: } out: assert_qgroups_uptodate(trans); + trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -3038,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); if (btrfs_test_is_dummy_root(root)) @@ -3079,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, 1); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - 1); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -3268,6 +3414,15 @@ again: spin_unlock(&block_group->lock); /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space @@ -3280,16 +3435,26 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages, num_pages); + ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. 
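The BTRFS_TRANS_CACHE_ENOSPC bit used here is part of the per-transaction flags bitmap this diff introduces (alongside BTRFS_TRANS_HAVE_FREE_BGS and BTRFS_TRANS_DIRTY_BG_RUN) and acts as a one-way latch: once one block group's space cache hits ENOSPC, cache setup is skipped for every later group in the same transaction. The latch pattern, in outline:

        /* First failure trips the latch... */
        if (ret == -ENOSPC)
                set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);

        /* ...and later groups check it before doing any work. */
        if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
                ret = -ENOSPC;
                goto out_put;
        }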
+ */ if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -3674,10 +3839,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - if (total_bytes > 0) - found->full = 0; - else - found->full = 1; + found->full = 0; + found->max_extent_size = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3754,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; - u64 tmp; + u64 raid_type; + u64 allowed = 0; /* * see if restripe for this chunk_type is in progress, if so @@ -3772,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) spin_unlock(&root->fs_info->balance_lock); /* First, mask out the RAID levels which aren't possible */ - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); - flags &= ~tmp; - - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else if (tmp & BTRFS_BLOCK_GROUP_RAID10) - tmp = BTRFS_BLOCK_GROUP_RAID10; - else if (tmp & BTRFS_BLOCK_GROUP_RAID1) - tmp = BTRFS_BLOCK_GROUP_RAID1; - else if (tmp & BTRFS_BLOCK_GROUP_RAID0) - tmp = BTRFS_BLOCK_GROUP_RAID0; - - return extended_to_chunk(flags | tmp); + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_group[raid_type]; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); } static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) @@ -3835,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. 
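The btrfs_reduce_alloc_profile() rewrite above replaces the open-coded device-count checks with a loop over btrfs_raid_array, masking out any profile whose devs_min exceeds the number of writable devices, then keeping the most redundant remaining profile (RAID6 > RAID5 > RAID10 > RAID1 > RAID0). Worked example, assuming the usual devs_min values (RAID1/RAID5: 2, RAID6: 3, RAID10: 4): with two rw devices and flags requesting RAID10 | RAID1, RAID10 is masked out and the profile reduces to RAID1.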
- */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3938,7 +4093,8 @@ commit_trans: if (IS_ERR(trans)) return PTR_ERR(trans); if (have_pinned_space >= 0 || - trans->transaction->have_free_bgs || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || need_commit > 0) { ret = btrfs_commit_transaction(trans, root); if (ret) @@ -3960,38 +4116,86 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - ret = btrfs_qgroup_reserve(root, write_bytes); - if (ret) - goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); -out: spin_unlock(&data_sinfo->lock); return ret; } /* - * Called if we need to clear a data reservation for this inode. + * New check_data_free_space() with ability for precious data reservation + * Will replace old btrfs_check_data_free_space(), but for patch split, + * add a new function first and then replace it. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + /* align the range */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (ret < 0) + return ret; + + /* + * Use new btrfs_qgroup_reserve_data to reserve precious data space + * + * TODO: Find a good method to avoid reserve data space for NOCOW + * range, but don't impact performance on quota disable case. + */ + ret = btrfs_qgroup_reserve_data(inode, start, len); + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, root->sectorsize); + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - WARN_ON(data_sinfo->bytes_may_use < bytes); - data_sinfo->bytes_may_use -= bytes; + if (WARN_ON(data_sinfo->bytes_may_use < len)) + data_sinfo->bytes_may_use = 0; + else + data_sinfo->bytes_may_use -= len; trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); + data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-indoe data rsv map for accurate reserved + * space framework. 
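Note the alignment convention shared by the new reserve and free paths above: the byte range is expanded outward to sector boundaries so the reservation covers every sector the write touches. With sectorsize = 4096, a 100-byte write at offset 3000 reserves exactly one sector:

        len   = round_up(3000 + 100, 4096) - round_down(3000, 4096);   /* 4096 - 0 = 4096 */
        start = round_down(3000, 4096);                                /* 0 */

The same rounding on the free side ensures a release is never smaller than the matching reservation.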
+ */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, start, len); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4241,7 +4445,8 @@ out: * the block groups that were made dirty during the lifetime of the * transaction. */ - if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + if (trans->can_flush_pending_bgs && + trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { btrfs_create_pending_block_groups(trans, trans->root); btrfs_trans_release_chunk_metadata(trans); } @@ -4822,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->csum_root && trans->adding_csums) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->uuid_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == root->fs_info->csum_root && trans->adding_csums) || + (root == root->fs_info->uuid_root)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -5271,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ num_bytes = 3 * root->nodesize; - ret = btrfs_qgroup_reserve(root, num_bytes); + ret = btrfs_qgroup_reserve_meta(root, num_bytes); if (ret) return ret; } else { @@ -5289,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); - if (ret) { - if (*qgroup_reserved) - btrfs_qgroup_free(root, *qgroup_reserved); - } + if (ret && *qgroup_reserved) + btrfs_qgroup_free_meta(root, *qgroup_reserved); return ret; } @@ -5453,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve_meta(root, + nr_extents * root->nodesize); if (ret) goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, nr_extents * root->nodesize); + btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } @@ -5584,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate + * @start: start range we are writing to + * @len: how long the range we are writing to + * + * TODO: This function will finally replace old btrfs_delalloc_reserve_space() * * This will do the following things * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding + * o reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * o reserve space for metadata space, based on the number of outstanding * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes + * also reserve metadata space in a per root 
over-reserve method. + * o add to the inodes->delalloc_bytes * o add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) * - * This will return 0 for success and -ENOSPC if there is no space left. + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, start, len); + if (ret < 0) return ret; - } - - return 0; + ret = btrfs_delalloc_reserve_metadata(inode, len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, start, len); + return ret; } /** * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up + * @start: start position of the space already reserved + * @len: the len of the space already reserved * * This must be matched with a call to btrfs_delalloc_reserve_space. This is * called in the case that we don't need the metadata AND data reservations @@ -5627,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, len); + btrfs_free_reserved_data_space(inode, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -5996,6 +6203,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. 
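Under the new range-based signatures, data (plus qgroup) and metadata space are reserved in one call and must be released over the same range on failure. A minimal sketch of the pairing (do_write is a hypothetical stand-in for the actual write path):

        ret = btrfs_delalloc_reserve_space(inode, pos, count);
        if (ret < 0)
                return ret;

        ret = do_write(inode, pos, count);
        if (ret < 0)
                btrfs_delalloc_release_space(inode, pos, count);  /* nothing was dirtied */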
+ */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, + u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + bool ssd = btrfs_test_opt(root, SSD); + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (ssd) + *empty_cluster = 2 * 1024 * 1024; + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &root->fs_info->meta_alloc_cluster; + if (!ssd) + *empty_cluster = 64 * 1024; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + ret = &root->fs_info->data_alloc_cluster; + } + + return ret; +} + static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, const bool return_free_space) { @@ -6003,7 +6238,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; bool readonly; while (start <= end) { @@ -6012,8 +6250,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); + total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(root, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; } len = cache->key.objectid + cache->key.offset - start; @@ -6026,12 +6270,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, } start += len; + total_unpinned += len; space_info = cache->space_info; + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. + */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; @@ -6062,20 +6321,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *block_group, *tmp; + struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; u64 end; int ret; - if (trans->aborted) - return 0; - if (fs_info->pinned_extents == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else unpin = &fs_info->freed_extents[0]; - while (1) { + while (!trans->aborted) { mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); @@ -6094,6 +6352,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + /* + * Transaction is finished. We don't need the lock anymore. We + * do need to clean up the block groups in case of a transaction + * abort. 
+ */ + deleted_bgs = &trans->transaction->deleted_bgs; + list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { + u64 trimmed = 0; + + ret = -EROFS; + if (!trans->aborted) + ret = btrfs_discard_extent(root, + block_group->key.objectid, + block_group->key.offset, + &trimmed); + + list_del_init(&block_group->bg_list); + btrfs_put_block_group_trimming(block_group); + btrfs_put_block_group(block_group); + + if (ret) { + const char *errstr = btrfs_decode_error(ret); + btrfs_warn(fs_info, + "Discard failed while removing blockgroup: errno=%d %s\n", + ret, errstr); + } + } + return 0; } @@ -6137,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - int no_quota = node->no_quota; u32 item_size; u64 refs; u64 bytenr = node->bytenr; @@ -6146,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (!info->quota_enabled || !is_fstree(root_objectid)) - no_quota = 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6349,7 +6631,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else { if (found_extent) { BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(root, path, iref)); + extent_data_ref_count(path, iref)); if (iref) { BUG_ON(path->slots[0] != extent_slot); } else { @@ -6474,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, 0); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -6522,7 +6804,7 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota) + u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6545,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, no_quota); + BTRFS_DROP_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, no_quota); + offset, 0, + BTRFS_DROP_DELAYED_REF, NULL); } return ret; } @@ -6737,7 +7019,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; - int empty_cluster = 2 * 1024 * 1024; + u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); @@ -6747,6 +7029,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool orig_have_caching_bg = false; + bool full_search = false; WARN_ON(num_bytes < root->sectorsize); ins->type = BTRFS_EXTENT_ITEM_KEY; @@ -6762,36 +7046,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. 
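The cleanup loop above walks deleted_bgs with list_for_each_entry_safe() because each iteration unlinks the entry and may drop the final reference, freeing the block group mid-walk. The generic shape of the pattern (discard and error handling omitted):

        struct btrfs_block_group_cache *bg, *tmp;

        list_for_each_entry_safe(bg, tmp, deleted_bgs, bg_list) {
                list_del_init(&bg->bg_list);    /* unlink first */
                btrfs_put_block_group(bg);      /* may free bg; tmp keeps the walk valid */
        }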
+ * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; + } + spin_unlock(&space_info->lock); } + last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } spin_unlock(&last_ptr->lock); } search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); @@ -6826,6 +7121,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } search: have_caching_bg = false; + if (index == 0 || index == __get_raid_index(flags)) + full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -6859,6 +7156,7 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { + have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6873,7 +7171,7 @@ have_block_group: * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr) { + if (last_ptr && use_cluster) { struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* @@ -6999,6 +7297,16 @@ refill_cluster: } unclustered_alloc: + /* + * We are doing an unclustered alloc, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get + * more space. 
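space_info->max_extent_size, consulted above, caches the largest contiguous free extent the last failed search could find, letting later allocations fail fast instead of rescanning every block group. Callers that can split a request use the hint returned in ins->offset to shrink and retry; roughly what btrfs_reserve_extent() does internally (a simplified sketch, not the verbatim code):

        bool final_tried = num_bytes == min_alloc_size;

again:
        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
                               flags, delalloc);
        if (ret == -ENOSPC && !final_tried && ins->offset) {
                /* ins->offset carries max_extent_size back to the caller */
                num_bytes = min(num_bytes >> 1, ins->offset);
                num_bytes = max(round_down(num_bytes, root->sectorsize),
                                min_alloc_size);
                final_tried = num_bytes == min_alloc_size;
                goto again;
        }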
+ */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < @@ -7031,8 +7339,6 @@ unclustered_alloc: failed_alloc = true; goto have_block_group; } else if (!offset) { - if (!cached) - have_caching_bg = true; goto loop; } checks: @@ -7073,6 +7379,10 @@ loop: } up_read(&space_info->groups_sem); + if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg + && !orig_have_caching_bg) + orig_have_caching_bg = true; + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) goto search; @@ -7089,7 +7399,20 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - loop++; + if (loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any unached bgs and we've alrelady done a + * full search through. + */ + if (orig_have_caching_bg || !full_search) + loop = LOOP_CACHING_WAIT; + else + loop = LOOP_ALLOC_CHUNK; + } else { + loop++; + } + if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; @@ -7107,6 +7430,15 @@ loop: ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + loop = LOOP_NO_EMPTY_SIZE; + /* * Do not bail out on ENOSPC since we * can do more things. @@ -7123,6 +7455,15 @@ loop: } if (loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (empty_size == 0 && + empty_cluster == 0) { + ret = -ENOSPC; + goto out; + } empty_size = 0; empty_cluster = 0; } @@ -7131,11 +7472,20 @@ loop: } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } ret = 0; } out: - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); ins->offset = max_extent_size; + } return ret; } @@ -7184,7 +7534,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { - bool final_tried = false; + bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; @@ -7333,8 +7683,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota) + int level, struct btrfs_key *ins) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -7415,7 +7764,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) { int ret; @@ -7424,7 +7774,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ram_bytes, BTRFS_ADD_DELAYED_EXTENT, + NULL); return ret; } @@ -7567,9 +7918,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, /* * 
finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, @@ -7641,7 +7989,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); + extent_op); if (ret) goto out_free_delayed; } @@ -8182,14 +8530,15 @@ skip: ret = account_shared_subtree(trans, root, next, generation, level - 1); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "Error " "%d accounting shared subtree. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); @@ -8274,10 +8623,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = account_leaf_items(trans, root, eb); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "error " "%d accounting leaf items. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } /* make block locked assertion in clean_tree_block happy */ @@ -8576,7 +8926,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { - btrfs_drop_and_free_fs_root(tree_root->fs_info, root); + btrfs_add_dropped_root(trans, root); } else { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); @@ -8599,7 +8949,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err); + btrfs_std_error(root->fs_info, err, NULL); return err; } @@ -8723,14 +9073,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) +static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; u64 min_allocable_bytes; int ret = -ENOSPC; - /* * We need some metadata space and system metadata space for * allocating chunks in some corner cases until we force to set @@ -8747,6 +9096,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) spin_lock(&cache->lock); if (cache->ro) { + cache->ro++; ret = 0; goto out; } @@ -8758,7 +9108,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - cache->ro = 1; + cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } @@ -8768,7 +9118,7 @@ out: return ret; } -int btrfs_set_block_group_ro(struct btrfs_root *root, +int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8776,8 +9126,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, u64 alloc_flags; int ret; - BUG_ON(cache->ro); - again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ 
-8789,7 +9137,7 @@ again: * back off and let this transaction commit */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (trans->transaction->dirty_bg_run) { + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; mutex_unlock(&root->fs_info->ro_block_group_mutex); @@ -8820,7 +9168,7 @@ again: goto out; } - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); @@ -8828,7 +9176,7 @@ again: CHUNK_ALLOC_FORCE); if (ret < 0) goto out; - ret = set_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, 0); out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); @@ -8891,7 +9239,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -void btrfs_set_block_group_rw(struct btrfs_root *root, +void btrfs_dec_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; @@ -8901,11 +9249,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root, spin_lock(&sinfo->lock); spin_lock(&cache->lock); - num_bytes = cache->key.offset - cache->reserved - cache->pinned - - cache->bytes_super - btrfs_block_group_used(&cache->item); - sinfo->bytes_readonly -= num_bytes; - cache->ro = 0; - list_del_init(&cache->ro_list); + if (!--cache->ro) { + num_bytes = cache->key.offset - cache->reserved - + cache->pinned - cache->bytes_super - + btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + list_del_init(&cache->ro_list); + } spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -9421,7 +9771,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) { - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } else if (btrfs_block_group_used(&cache->item) == 0) { spin_lock(&info->unused_bgs_lock); /* Should always be true but just in case. 
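cache->ro is now a counter rather than a flag: each btrfs_inc_block_group_ro() user (scrub, relocation, the unused-block-group deleter) stacks a reference, and only the final btrfs_dec_block_group_ro() returns the group's spare bytes to sinfo->bytes_readonly and takes it off ro_list. Illustrative pairing (return values ignored for brevity):

        btrfs_inc_block_group_ro(root, cache);  /* user A: marks RO, adjusts accounting */
        btrfs_inc_block_group_ro(root, cache);  /* user B: cache->ro just goes 1 -> 2 */

        btrfs_dec_block_group_ro(root, cache);  /* B done: still read-only */
        btrfs_dec_block_group_ro(root, cache);  /* A done: writable again */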
*/ @@ -9449,11 +9799,11 @@ int btrfs_read_block_groups(struct btrfs_root *root) list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_RAID0], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_SINGLE], list) - set_block_group_ro(cache, 1); + inc_block_group_ro(cache, 1); } init_global_block_rsv(info); @@ -9471,7 +9821,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; + trans->can_flush_pending_bgs = false; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { if (ret) goto next; @@ -9492,6 +9844,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, next: list_del_init(&block_group->bg_list); } + trans->can_flush_pending_bgs = can_flush_pending_bgs; } int btrfs_make_block_group(struct btrfs_trans_handle *trans, @@ -9534,6 +9887,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(root, cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(root, cache); + } +#endif /* * Call to ensure the corresponding space_info object is created and * assigned to our block group, but don't update its counters just yet. @@ -9834,6 +10195,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same * physical device locations unless we take this special care. + * + * There may also be an implicit trim operation if the file system + * is mounted with -odiscard. The same protections must remain + * in place until the extents have been discarded completely when + * the transaction commit has completed. */ remove_em = (atomic_read(&block_group->trimming) == 0); /* @@ -9908,6 +10274,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { u64 start, end; + int trimming; block_group = list_first_entry(&fs_info->unused_bgs, struct btrfs_block_group_cache, @@ -9941,7 +10308,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); /* We don't want to force the issue, only flip if it's ok. 
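When mounted with -odiscard, deleting an unused block group defers the actual discard to transaction commit (the deleted_bgs loop added to btrfs_finish_extent_commit() earlier in this diff). The trimming reference pins the chunk mapping so its physical range cannot be reallocated before the discard runs; the pairing in the hunks below is, in outline:

        if (trimming)
                btrfs_get_block_group_trimming(block_group);    /* pin until commit */

        ret = btrfs_remove_chunk(trans, root, block_group->key.objectid);
        if (ret) {
                if (trimming)
                        btrfs_put_block_group_trimming(block_group);  /* undo on failure */
                goto end_trans;
        }
        if (trimming)
                list_move(&block_group->bg_list,
                          &trans->transaction->deleted_bgs);    /* discarded at commit */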
*/ - ret = set_block_group_ro(block_group, 0); + ret = inc_block_group_ro(block_group, 0); up_write(&space_info->groups_sem); if (ret < 0) { ret = 0; @@ -9955,7 +10322,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) /* 1 for btrfs_orphan_reserve_metadata() */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); ret = PTR_ERR(trans); goto next; } @@ -9982,14 +10349,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_set_block_group_rw(root, block_group); + btrfs_dec_block_group_ro(root, block_group); goto end_trans; } mutex_unlock(&fs_info->unused_bg_unpin_mutex); @@ -10007,12 +10374,39 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); + /* DISCARD can flip during remount */ + trimming = btrfs_test_opt(root, DISCARD); + + /* Implicit trim during transaction commit. */ + if (trimming) + btrfs_get_block_group_trimming(block_group); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. */ ret = btrfs_remove_chunk(trans, root, block_group->key.objectid); + + if (ret) { + if (trimming) + btrfs_put_block_group_trimming(block_group); + goto end_trans; + } + + /* + * If we're not mounted with -odiscard, we can just forget + * about this block group. Otherwise we'll need to wait + * until transaction commit to do the actual discard. + */ + if (trimming) { + WARN_ON(!list_empty(&block_group->bg_list)); + spin_lock(&trans->transaction->deleted_bgs_lock); + list_move(&block_group->bg_list, + &trans->transaction->deleted_bgs); + spin_unlock(&trans->transaction->deleted_bgs_lock); + btrfs_get_block_group(block_group); + } end_trans: btrfs_end_transaction(trans, root); next: @@ -10066,10 +10460,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) return unpin_extent_range(root, start, end, false); } +/* + * It used to be that old block groups would be left around forever. + * Iterating over them would be enough to trim unused space. Since we + * now automatically remove them, we also need to iterate over unallocated + * space. + * + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account + * to ensure that we're not discarding chunks that were released in + * the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. For that, we need to take a reference to the + * transaction. + */ +static int btrfs_trim_free_extents(struct btrfs_device *device, + u64 minlen, u64 *trimmed) +{ + u64 start = 0, len = 0; + int ret; + + *trimmed = 0; + + /* Not writeable = nothing to do. */ + if (!device->writeable) + return 0; + + /* No free space = nothing to do. 
*/ + if (device->total_bytes <= device->bytes_used) + return 0; + + ret = 0; + + while (1) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_transaction *trans; + u64 bytes; + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) + return ret; + + down_read(&fs_info->commit_root_sem); + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); + if (trans) + btrfs_put_transaction(trans); + + if (ret) { + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + if (ret == -ENOSPC) + ret = 0; + break; + } + + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); + up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) + break; + + start += len; + *trimmed += bytes; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} + int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; + struct btrfs_device *device; + struct list_head *devices; u64 group_trimmed; u64 start; u64 end; @@ -10124,6 +10607,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) cache = next_block_group(fs_info->tree_root, cache); } + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + devices = &root->fs_info->fs_devices->alloc_list; + list_for_each_entry(device, devices, dev_alloc_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); + if (ret) + break; + + trimmed += group_trimmed; + } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + range->len = trimmed; return ret; } @@ -10140,8 +10635,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* - * Make sure counter is updated before we wake up - * waiters. + * Make sure counter is updated before we wake up waiters. 
*/ smp_mb(); if (waitqueue_active(&root->subv_writers->wait)) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 02d05817cbdf..33a01ea41465 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, inode = tree->mapping->host; isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - printk_ratelimited(KERN_DEBUG - "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", caller, btrfs_ino(inode), isize, start, end); } } @@ -131,6 +131,25 @@ struct extent_page_data { unsigned int sync_io:1; }; +static void add_extent_changeset(struct extent_state *state, unsigned bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return; + if (set && (state->state & bits) == bits) + return; + if (!set && (state->state & bits) == 0) + return; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + /* ENOMEM */ + BUG_ON(ret < 0); +} + static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) @@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits); + struct extent_state *state, unsigned *bits, + struct extent_changeset *changeset); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree, state->start = start; state->end = end; - set_state_bits(tree, state, bits); + set_state_bits(tree, state, bits, changeset); node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { @@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake) + unsigned *bits, int wake, + struct extent_changeset *changeset) { struct extent_state *next; unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; @@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); + add_extent_changeset(state, bits_to_clear, changeset, 0); state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); @@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. 
*/ -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask) +static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -671,7 +693,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, + changeset); goto next; } goto search_again; @@ -692,13 +715,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -789,7 +812,7 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; @@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree, u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } + add_extent_changeset(state, bits_to_set, changeset, 1); state->state |= bits_to_set; } @@ -835,7 +859,7 @@ static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -873,7 +897,7 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -899,7 +923,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -945,7 +969,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -980,7 +1004,7 @@ hit_next: * the later extent. 
*/ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1008,7 +1032,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask); + cached_state, mask, NULL); } @@ -1111,7 +1135,7 @@ again: goto out; } err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1130,9 +1154,9 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1171,9 +1195,10 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, + NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1208,7 +1233,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1233,9 +1258,9 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0); + clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. 
+ */ + BUG_ON(bits & EXTENT_LOCKED); + + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + changeset); +} + +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask) +{ + return __clear_extent_bit(tree, start, end, bits, wake, delete, + cached, mask, NULL); +} + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { @@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + BUG_ON(bits & EXTENT_LOCKED); + + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + changeset); +} + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { @@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS); + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS); + &failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -2078,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO - "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); bio_put(bio); @@ -2486,7 +2549,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct bio *bio, int err) +static void end_bio_extent_writepage(struct bio *bio) { struct bio_vec *bvec; u64 start; @@ -2516,7 +2579,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (end_extent_writepage(page, err, start, end)) + if (end_extent_writepage(page, bio->bi_error, start, end)) continue; end_page_writeback(page); @@ -2548,10 +2611,10 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. 
*/ -static void end_bio_extent_readpage(struct bio *bio, int err) +static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2564,16 +2627,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) int ret; int i; - if (err) - uptodate = 0; - bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, - io_bio->mirror_num); + "mirror=%u\n", (u64)bio->bi_iter.bi_sector, + bio->bi_error, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; /* We always issue full-page reads, but if some block @@ -2614,8 +2674,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (!ret && !err && - test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!ret && !bio->bi_error) uptodate = 1; } else { /* @@ -2631,10 +2690,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = - test_bit(BIO_UPTODATE, &bio->bi_flags); - if (err) - uptodate = 0; + uptodate = !bio->bi_error; offset += len; continue; } @@ -2684,7 +2740,7 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, bio->bi_error); bio_put(bio); } @@ -2730,6 +2786,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; + +#ifdef CONFIG_BLK_CGROUP + /* FIXME, put this into bio_clone_bioset */ + if (bio->bi_css) + bio_associate_blkcg(new, bio->bi_css); +#endif } return new; } @@ -2790,6 +2852,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, } static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, struct block_device *bdev, @@ -2798,13 +2861,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_end_io_t end_io_func, int mirror_num, unsigned long prev_bio_flags, - unsigned long bio_flags) + unsigned long bio_flags, + bool force_bio_submit) { int ret = 0; struct bio *bio; - int nr; int contig = 0; - int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); @@ -2816,6 +2878,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || + force_bio_submit || merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, @@ -2826,21 +2889,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, } bio = NULL; } else { + if (wbc) + wbc_account_io(wbc, page, page_size); return 0; } } - if (this_compressed) - nr = BIO_MAX_PAGES; - else - nr = bio_get_nr_vecs(bdev); - bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, 
BIO_MAX_PAGES,
+			GFP_NOFS | __GFP_HIGH);
 	if (!bio)
 		return -ENOMEM;
 
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
+	if (wbc) {
+		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, page, page_size);
+	}
 
 	if (bio_ret)
 		*bio_ret = bio;
@@ -2909,7 +2975,8 @@ static int __do_readpage(struct extent_io_tree *tree,
 			 get_extent_t *get_extent,
 			 struct extent_map **em_cached,
 			 struct bio **bio, int mirror_num,
-			 unsigned long *bio_flags, int rw)
+			 unsigned long *bio_flags, int rw,
+			 u64 *prev_em_start)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = page_offset(page);
@@ -2957,6 +3024,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 	}
 	while (cur <= end) {
 		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+		bool force_bio_submit = false;
 
 		if (cur >= last_byte) {
 			char *userpage;
@@ -3007,6 +3075,49 @@ static int __do_readpage(struct extent_io_tree *tree,
 		block_start = em->block_start;
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			block_start = EXTENT_MAP_HOLE;
+
+		/*
+		 * If we have a file range that points to a compressed extent
+		 * and it's followed by a consecutive file range that points to
+		 * the same compressed extent (possibly with a different
+		 * offset and/or length, so it either points to the whole extent
+		 * or only part of it), we must make sure we do not submit a
+		 * single bio to populate the pages for the 2 ranges because
+		 * this makes the compressed extent read zero out the pages
+		 * belonging to the 2nd range. Imagine the following scenario:
+		 *
+		 *  File layout
+		 *  [0 - 8K]                     [8K - 24K]
+		 *    |                               |
+		 *    |                               |
+		 * points to extent X,         points to extent X,
+		 * offset 4K, length of 8K     offset 0, length 16K
+		 *
+		 * [extent X, compressed length = 4K uncompressed length = 16K]
+		 *
+		 * If the bio to read the compressed extent covers both ranges,
+		 * it will decompress extent X into the pages belonging to the
+		 * first range and then it will stop, zeroing out the remaining
+		 * pages that belong to the other range that points to extent X.
+		 * So here we make sure we submit 2 bios, one for the first
+		 * range and another one for the second range. Both will target
+		 * the same physical extent from disk, but we can't currently
+		 * make the compressed bio endio callback populate the pages
+		 * for both ranges because each compressed bio is tightly
+		 * coupled with a single extent map, and each range can have
+		 * an extent map with a different offset value relative to the
+		 * uncompressed data of our extent and different lengths. This
+		 * is a corner case so we prioritize correctness over
+		 * non-optimal behavior (submitting 2 bios for the same extent).
+ */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + prev_em_start && *prev_em_start != (u64)-1 && + *prev_em_start != em->orig_start) + force_bio_submit = true; + + if (prev_em_start) + *prev_em_start = em->orig_start; + free_extent_map(em); em = NULL; @@ -3022,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (parent_locked) + free_extent_state(cached); + else + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -3051,12 +3166,13 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, page, + ret = submit_extent_page(rw, tree, NULL, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, - this_bio_flag); + this_bio_flag, + force_bio_submit); if (!ret) { nr++; *bio_flags = this_bio_flag; @@ -3083,7 +3199,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int rw, + u64 *prev_em_start) { struct inode *inode; struct btrfs_ordered_extent *ordered; @@ -3103,7 +3220,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, rw); + mirror_num, bio_flags, rw, prev_em_start); page_cache_release(pages[index]); } } @@ -3113,7 +3230,8 @@ static void __extent_readpages(struct extent_io_tree *tree, int nr_pages, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int rw, + u64 *prev_em_start) { u64 start = 0; u64 end = 0; @@ -3134,7 +3252,7 @@ static void __extent_readpages(struct extent_io_tree *tree, index - first_index, start, end, get_extent, em_cached, bio, mirror_num, bio_flags, - rw); + rw, prev_em_start); start = page_start; end = start + PAGE_CACHE_SIZE - 1; first_index = index; @@ -3145,7 +3263,8 @@ static void __extent_readpages(struct extent_io_tree *tree, __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, end, get_extent, em_cached, bio, - mirror_num, bio_flags, rw); + mirror_num, bio_flags, rw, + prev_em_start); } static int __extent_read_full_page(struct extent_io_tree *tree, @@ -3171,7 +3290,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, - bio_flags, rw); + bio_flags, rw, NULL); return ret; } @@ -3197,7 +3316,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, int ret; ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, - &bio_flags, READ); + &bio_flags, READ, NULL); if (bio) ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; @@ -3446,11 +3565,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(write_flags, tree, page, + ret = submit_extent_page(write_flags, tree, wbc, page, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, end_bio_extent_writepage, - 0, 0, 0); + 0, 0, 0, false); if (ret) SetPageError(page); } @@ -3696,7 +3815,7 @@ static 
void set_btree_ioerr(struct page *page) } } -static void end_bio_extent_buffer_writepage(struct bio *bio, int err) +static void end_bio_extent_buffer_writepage(struct bio *bio) { struct bio_vec *bvec; struct extent_buffer *eb; @@ -3709,7 +3828,8 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + if (bio->bi_error || + test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); } @@ -3749,10 +3869,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, p, offset >> 9, + ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, PAGE_CACHE_SIZE, 0, bdev, &epd->bio, -1, end_bio_extent_buffer_writepage, - 0, epd->bio_flags, bio_flags); + 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; if (ret) { set_btree_ioerr(p); @@ -4156,6 +4276,7 @@ int extent_readpages(struct extent_io_tree *tree, struct page *page; struct extent_map *em_cached = NULL; int nr = 0; + u64 prev_em_start = (u64)-1; for (page_idx = 0; page_idx < nr_pages; page_idx++) { page = list_entry(pages->prev, struct page, lru); @@ -4172,12 +4293,12 @@ int extent_readpages(struct extent_io_tree *tree, if (nr < ARRAY_SIZE(pagepool)) continue; __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ); + &bio, 0, &bio_flags, READ, &prev_em_start); nr = 0; } if (nr) __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ); + &bio, 0, &bio_flags, READ, &prev_em_start); if (em_cached) free_extent_map(em_cached); @@ -4614,9 +4735,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, { struct extent_buffer *eb = NULL; - eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); - if (eb == NULL) - return NULL; + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; eb->len = len; eb->fs_info = fs_info; @@ -4874,7 +4993,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS); + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) goto free_eb; @@ -5514,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu dst len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus src_offset %lu move " + "len %lu dst len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu dst len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus dst_offset %lu move " + "len %lu dst len %lu", dst_offset, len, dst->len); BUG_ON(1); } @@ -5560,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move " + "len %lu len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus 
dst_offset %lu move " - "len %lu len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move " + "len %lu len %lu", dst_offset, len, dst->len); BUG_ON(1); } if (dst_offset < src_offset) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c668f36898d3..f4c1ae11855f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include <linux/rbtree.h> +#include "ulist.h" /* bits for the extent state */ #define EXTENT_DIRTY (1U << 0) @@ -18,6 +19,7 @@ #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) +#define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -161,6 +163,17 @@ struct extent_buffer { #endif }; +/* + * Structure to record how many bytes and which ranges are set/cleared + */ +struct extent_changeset { + /* How many bytes are set/cleared in this operation */ + u64 bytes_changed; + + /* Changed ranges */ + struct ulist *range_changed; +}; + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b823fac91c92..6bd5ce9d75f0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -847,7 +847,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 1); + start - extent_offset); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -925,7 +925,7 @@ delete_extent_item: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1204,7 +1204,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 1); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1231,7 +1231,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } other_start = 0; @@ -1248,7 +1248,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { @@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, u64 release_bytes = 0; u64 lockstart; u64 lockend; - unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; 
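The extent_changeset added to extent_io.h above pairs a byte counter with a ulist of ranges, so callers of set_record_extent_bits()/clear_record_extent_bits() can learn exactly how many bytes a bit operation really touched. Below is a minimal standalone C sketch of just that accounting rule; toy_state, toy_changeset and the fixed-size array standing in for the kernel's ulist are invented for illustration, but the bit tests mirror the checks in add_extent_changeset(): a range only counts when the requested bits actually flip.

/*
 * Userspace model of the changeset accounting. Not kernel code:
 * the array below stands in for the struct ulist of changed ranges.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_state {
	uint64_t start;
	uint64_t end;	/* inclusive, like struct extent_state */
	unsigned bits;
};

struct toy_changeset {
	uint64_t bytes_changed;
	uint64_t ranges[16][2];	/* stand-in for the ulist */
	int nr;
};

static void toy_account(struct toy_state *state, unsigned bits,
			struct toy_changeset *cs, int set)
{
	if (!cs)
		return;
	/* setting bits that are already set changes nothing */
	if (set && (state->bits & bits) == bits)
		return;
	/* clearing bits that are already clear changes nothing */
	if (!set && (state->bits & bits) == 0)
		return;
	cs->bytes_changed += state->end - state->start + 1;
	if (cs->nr < 16) {
		cs->ranges[cs->nr][0] = state->start;
		cs->ranges[cs->nr][1] = state->end;
		cs->nr++;
	}
}

int main(void)
{
	struct toy_changeset cs = { 0 };
	struct toy_state a = { 0, 4095, 0x0 };	/* bit clear: set is a change */
	struct toy_state b = { 4096, 8191, 0x1 };	/* bit already set: no-op */

	toy_account(&a, 0x1, &cs, 1);
	toy_account(&b, 0x1, &cs, 1);
	printf("bytes_changed=%llu ranges=%d\n",
	       (unsigned long long)cs.bytes_changed, cs.nr);
	return 0;
}

This prints bytes_changed=4096 ranges=1: the second call is a no-op, which is also why the recording variants refuse EXTENT_LOCKED, where an -EEXIST can abort a range part way through.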
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	if (!pages)
 		return -ENOMEM;
 
-	first_index = pos >> PAGE_CACHE_SHIFT;
-
 	while (iov_iter_count(i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@
 		}
 
 		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-		ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-		if (ret == -ENOSPC &&
-		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-					      BTRFS_INODE_PREALLOC))) {
+
+		if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					     BTRFS_INODE_PREALLOC)) {
 			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret < 0)
+				break;
 			if (ret > 0) {
+				/*
+				 * For nodata cow case, no need to reserve
+				 * data space.
+				 */
 				only_release_metadata = true;
 				/*
 				 * our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@
 				num_pages = DIV_ROUND_UP(write_bytes + offset,
 							 PAGE_CACHE_SIZE);
 				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-				ret = 0;
-			} else {
-				ret = -ENOSPC;
+				goto reserve_metadata;
 			}
 		}
-
-		if (ret)
+		ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+		if (ret < 0)
 			break;
 
+reserve_metadata:
 		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
 		if (ret) {
 			if (!only_release_metadata)
-				btrfs_free_reserved_data_space(inode,
-							       reserve_bytes);
+				btrfs_free_reserved_data_space(inode, pos,
+							       write_bytes);
 			else
 				btrfs_end_write_no_snapshoting(root);
 			break;
 		}
@@ -1603,12 +1604,17 @@ again:
 				BTRFS_I(inode)->outstanding_extents++;
 				spin_unlock(&BTRFS_I(inode)->lock);
 			}
-			if (only_release_metadata)
+			if (only_release_metadata) {
 				btrfs_delalloc_release_metadata(inode,
 								release_bytes);
-			else
-				btrfs_delalloc_release_space(inode,
+			} else {
+				u64 __pos;
+
+				__pos = round_down(pos, root->sectorsize) +
+					(dirty_pages << PAGE_CACHE_SHIFT);
+				btrfs_delalloc_release_space(inode, __pos,
							     release_bytes);
+			}
 		}
 
 		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1666,7 @@ again:
 			btrfs_end_write_no_snapshoting(root);
 			btrfs_delalloc_release_metadata(inode, release_bytes);
 		} else {
-			btrfs_delalloc_release_space(inode, release_bytes);
+			btrfs_delalloc_release_space(inode, pos, release_bytes);
 		}
 	}
 
@@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	u64 drop_end;
 	int ret = 0;
 	int err = 0;
-	int rsv_count;
+	unsigned int rsv_count;
 	bool same_page;
 	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 	u64 ino_size;
@@ -2488,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
 
 	/*
+	 * If we are using the NO_HOLES feature we might have already had a
+	 * hole that overlaps a part of the region [lockstart, lockend] and
+	 * ends at (or beyond) lockend. Since we have no file extent items to
+	 * represent holes, drop_end can be less than lockend and so we must
+	 * make sure we have an extent map representing the existing hole (the
+	 * call to __btrfs_drop_extents() might have dropped the existing extent
+	 * map representing the existing hole), otherwise the fast fsync path
+	 * will not record the existence of the hole region
+	 * [existing_hole_start, lockend].
+	 */
+	if (drop_end <= lockend)
+		drop_end = lockend + 1;
+
 	/*
 	 * Don't insert file hole extent item if it's for a range beyond eof
 	 * (because it's useless) or if it represents a 0 bytes range (when
 	 * cur_offset == drop_end).
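The next hunk introduces add_falloc_range(), which queues the ranges fallocate still needs to preallocate and merges each new range into the previous one when they touch; because fallocate walks the file in increasing byte order, only the tail of the list can ever be adjacent to the new range. A standalone userspace sketch of the same coalescing rule follows, with a growable array standing in for the kernel's list_head (all names here are illustrative, not kernel API).

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct range { uint64_t start, len; };

struct range_list {
	struct range *items;
	size_t nr, cap;
};

/* Merge with the tail when contiguous, append otherwise. */
static int add_range(struct range_list *rl, uint64_t start, uint64_t len)
{
	if (rl->nr && rl->items[rl->nr - 1].start +
		      rl->items[rl->nr - 1].len == start) {
		rl->items[rl->nr - 1].len += len;
		return 0;
	}
	if (rl->nr == rl->cap) {
		size_t cap = rl->cap ? rl->cap * 2 : 4;
		struct range *tmp = realloc(rl->items, cap * sizeof(*tmp));

		if (!tmp)
			return -1;	/* -ENOMEM in the kernel version */
		rl->items = tmp;
		rl->cap = cap;
	}
	rl->items[rl->nr++] = (struct range){ start, len };
	return 0;
}

int main(void)
{
	struct range_list rl = { 0 };
	size_t i;

	add_range(&rl, 0, 4096);
	add_range(&rl, 4096, 8192);	/* contiguous: merged with previous */
	add_range(&rl, 16384, 4096);	/* gap: new entry */
	for (i = 0; i < rl.nr; i++)
		printf("[%llu, +%llu)\n",
		       (unsigned long long)rl.items[i].start,
		       (unsigned long long)rl.items[i].len);
	free(rl.items);
	return 0;
}

Keeping the list coalesced means the later btrfs_prealloc_file_range() pass issues one call per contiguous region instead of one per extent map returned by btrfs_get_extent().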
@@ -2541,17 +2560,61 @@ out_only_mutex:
 	return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+	struct list_head list;
+	u64 start;
+	u64 len;
+};
+
+/*
+ * Helper function to add a falloc range
+ *
+ * Caller should have locked the larger extent range containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+	struct falloc_range *prev = NULL;
+	struct falloc_range *range = NULL;
+
+	if (list_empty(head))
+		goto insert;
+
+	/*
+	 * As fallocate iterates in bytenr order, we only need to check
+	 * the last range.
+	 */
+	prev = list_entry(head->prev, struct falloc_range, list);
+	if (prev->start + prev->len == start) {
+		prev->len += len;
+		return 0;
+	}
+insert:
+	range = kmalloc(sizeof(*range), GFP_NOFS);
+	if (!range)
+		return -ENOMEM;
+	range->start = start;
+	range->len = len;
+	list_add_tail(&range->list, head);
+	return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
 	struct extent_state *cached_state = NULL;
+	struct falloc_range *range;
+	struct falloc_range *tmp;
+	struct list_head reserve_list;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
 	u64 alloc_end;
 	u64 alloc_hint = 0;
 	u64 locked_end;
+	u64 actual_end = 0;
 	struct extent_map *em;
 	int blocksize = BTRFS_I(inode)->root->sectorsize;
 	int ret;
@@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 		return btrfs_punch_hole(inode, offset, len);
 
 	/*
-	 * Make sure we have enough space before we do the
-	 * allocation.
+	 * Only trigger disk allocation, don't trigger qgroup reserve
+	 *
+	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
-	if (ret)
+	ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+	if (ret < 0)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
@@ -2579,12 +2643,19 @@
 	if (ret)
 		goto out;
 
+	/*
+	 * TODO: Move these two operations after we have checked
+	 * accurate reserved space, or fallocate can still fail but
+	 * with page truncated or size expanded.
+	 *
+	 * But that's a minor problem and won't do much harm BTW.
+	 */
 	if (alloc_start > inode->i_size) {
 		ret = btrfs_cont_expand(inode, i_size_read(inode),
 					alloc_start);
 		if (ret)
 			goto out;
-	} else {
+	} else if (offset + len > inode->i_size) {
 		/*
 		 * If we are fallocating from the end of the file onward we
 		 * need to zero out the end of the page if i_size lands in the
@@ -2637,10 +2708,10 @@
 		}
 	}
 
+	/* First, check if we exceed the qgroup limit */
+	INIT_LIST_HEAD(&reserve_list);
 	cur_offset = alloc_start;
 	while (1) {
-		u64 actual_end;
-
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 				      alloc_end - cur_offset, 0);
 		if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2724,82 @@
 		last_byte = min(extent_map_end(em), alloc_end);
 		actual_end = min_t(u64, extent_map_end(em), offset + len);
 		last_byte = ALIGN(last_byte, blocksize);
-
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-							last_byte - cur_offset,
-							1 << inode->i_blkbits,
-							offset + len,
-							&alloc_hint);
-		} else if (actual_end > inode->i_size &&
-			   !(mode & FALLOC_FL_KEEP_SIZE)) {
-			struct btrfs_trans_handle *trans;
-			struct btrfs_root *root = BTRFS_I(inode)->root;
-
-			/*
-			 * We didn't need to allocate any more space, but we
-			 * still extended the size of the file so we need to
-			 * update i_size and the inode item.
-			 */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-			} else {
-				inode->i_ctime = CURRENT_TIME;
-				i_size_write(inode, actual_end);
-				btrfs_ordered_update_i_size(inode, actual_end,
-							    NULL);
-				ret = btrfs_update_inode(trans, root, inode);
-				if (ret)
-					btrfs_end_transaction(trans, root);
-				else
-					ret = btrfs_end_transaction(trans,
-								    root);
+			ret = add_falloc_range(&reserve_list, cur_offset,
+					       last_byte - cur_offset);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
 			}
+			ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+					last_byte - cur_offset);
+			if (ret < 0)
+				break;
 		}
 		free_extent_map(em);
-		if (ret < 0)
-			break;
-
 		cur_offset = last_byte;
-		if (cur_offset >= alloc_end) {
-			ret = 0;
+		if (cur_offset >= alloc_end)
 			break;
+	}
+
+	/*
+	 * If ret is still 0, it means we're OK to fallocate.
+	 * Otherwise just clean up the list and exit.
+	 */
+	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+		if (!ret)
+			ret = btrfs_prealloc_file_range(inode, mode,
+					range->start,
+					range->len, 1 << inode->i_blkbits,
+					offset + len, &alloc_hint);
+		list_del(&range->list);
+		kfree(range);
+	}
+	if (ret < 0)
+		goto out_unlock;
+
+	if (actual_end > inode->i_size &&
+	    !(mode & FALLOC_FL_KEEP_SIZE)) {
+		struct btrfs_trans_handle *trans;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+
+		/*
+		 * We didn't need to allocate any more space, but we
+		 * still extended the size of the file so we need to
+		 * update i_size and the inode item.
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+		} else {
+			inode->i_ctime = CURRENT_TIME;
+			i_size_write(inode, actual_end);
+			btrfs_ordered_update_i_size(inode, actual_end, NULL);
+			ret = btrfs_update_inode(trans, root, inode);
+			if (ret)
+				btrfs_end_transaction(trans, root);
+			else
+				ret = btrfs_end_transaction(trans, root);
 		}
 	}
+out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 out:
+	/*
+	 * As we have waited on the extent range, the data_rsv_map must be
+	 * empty in the range, as the written data range will be released
+	 * from it. And for a preallocated extent, it will also be released
+	 * when its metadata is written.
+	 * So this is purely a cleanup.
+	 */
+	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_start,
+				       alloc_end - alloc_start);
 	return ret;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index fb5a6b1c62a6..0948d34cb84a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 
 	gen = io_ctl->cur;
 	if (le64_to_cpu(*gen) != generation) {
-		printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
-				   "(%Lu) does not match inode (%Lu)\n", *gen,
-				   generation);
+		btrfs_err_rl(io_ctl->root->fs_info,
+			"space cache generation (%llu) does not match inode (%llu)",
+				*gen, generation);
 		io_ctl_unmap_page(io_ctl);
 		return -EIO;
 	}
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	if (val != crc) {
-		printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
-				   "space cache\n");
+		btrfs_err_rl(io_ctl->root->fs_info,
+			"csum mismatch on free space cache");
 		io_ctl_unmap_page(io_ctl);
 		return -EIO;
 	}
@@ -1215,7 +1215,7 @@ out:
 * @offset - the offset for the key we'll insert
 *
 * This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
+ * on mount. This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
@@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
-			 u64 *bytes)
+			 u64 *bytes, bool for_alloc)
{
	unsigned long found_bits = 0;
	unsigned long max_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;
	unsigned long extent_bits;

+	/*
+	 * Skip searching the bitmap if we don't have a contiguous section that
+	 * is large enough for this allocation.
+ */ + if (for_alloc && + bitmap_info->max_extent_size && + bitmap_info->max_extent_size < *bytes) { + *bytes = bitmap_info->max_extent_size; + return -1; + } + i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { + if (for_alloc && bits == 1) { + found_bits = 1; + break; + } next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); extent_bits = next_zero - i; @@ -1762,6 +1777,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; return -1; } @@ -1813,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (entry->bitmap) { u64 size = *bytes; - ret = search_bitmap(ctl, entry, &tmp, &size); + ret = search_bitmap(ctl, entry, &tmp, &size, true); if (!ret) { *offset = tmp; *bytes = size; @@ -1874,7 +1890,8 @@ again: search_start = *offset; search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); - ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, + false); if (ret < 0 || search_start != *offset) return -EINVAL; @@ -1919,7 +1936,7 @@ again: search_start = *offset; search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, - &search_bytes); + &search_bytes, false); if (ret < 0 || search_start != *offset) return -EAGAIN; @@ -1943,6 +1960,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; + return bytes_to_set; } @@ -1951,12 +1974,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group_cache *block_group = ctl->private; + bool forced = false; + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root, + block_group)) + forced = true; +#endif /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (ctl->free_extents < ctl->extents_thresh) { + if (!forced && ctl->free_extents < ctl->extents_thresh) { /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want @@ -2661,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes); + err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { if (search_bytes > *max_extent_size) *max_extent_size = search_bytes; @@ -2775,6 +2805,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long want_bits; unsigned long min_bits; unsigned long found_bits; + unsigned long max_bits = 0; unsigned long start = 0; unsigned long total_found = 0; int ret; @@ -2784,6 +2815,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, want_bits = bytes_to_bits(bytes, ctl->unit); min_bits = bytes_to_bits(min_bytes, ctl->unit); + /* + * Don't bother looking for a cluster in this bitmap if it's heavily + * fragmented. 
+ */ + if (entry->max_extent_size && + entry->max_extent_size < cont1_bytes) + return -ENOSPC; again: found_bits = 0; for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { @@ -2791,13 +2829,19 @@ again: BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { found_bits = next_zero - i; + if (found_bits > max_bits) + max_bits = found_bits; break; } + if (next_zero - i > max_bits) + max_bits = next_zero - i; i = next_zero; } - if (!found_bits) + if (!found_bits) { + entry->max_extent_size = (u64)max_bits * ctl->unit; return -ENOSPC; + } if (!total_found) { start = i; @@ -3056,6 +3100,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; + cluster->fragmented = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } @@ -3223,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, } bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); + ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3272,35 +3317,23 @@ next: return ret; } -int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, - u64 *trimmed, u64 start, u64 end, u64 minlen) +void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache) { - int ret; + atomic_inc(&cache->trimming); +} - *trimmed = 0; +void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + bool cleanup; spin_lock(&block_group->lock); - if (block_group->removed) { - spin_unlock(&block_group->lock); - return 0; - } - atomic_inc(&block_group->trimming); + cleanup = (atomic_dec_and_test(&block_group->trimming) && + block_group->removed); spin_unlock(&block_group->lock); - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); - if (ret) - goto out; - - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); -out: - spin_lock(&block_group->lock); - if (atomic_dec_and_test(&block_group->trimming) && - block_group->removed) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - spin_unlock(&block_group->lock); - + if (cleanup) { lock_chunks(block_group->fs_info->chunk_root); em_tree = &block_group->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); @@ -3324,10 +3357,31 @@ out: * this block group have left 1 entry each one. Free them. 
*/ __btrfs_remove_free_space_cache(block_group->free_space_ctl); - } else { + } +} + +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { spin_unlock(&block_group->lock); + return 0; } + btrfs_get_block_group_trimming(block_group); + spin_unlock(&block_group->lock); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + if (ret) + goto out; + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + btrfs_put_block_group_trimming(block_group); return ret; } @@ -3367,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) u64 count = 1; int ret; - ret = search_bitmap(ctl, entry, &offset, &count); + ret = search_bitmap(ctl, entry, &offset, &count, true); /* Logic error; Should be empty if it can't find anything */ ASSERT(!ret); @@ -3523,6 +3577,7 @@ again: spin_lock(&ctl->tree_lock); info->offset = offset; info->bytes = bytes; + info->max_extent_size = 0; ret = link_free_space(ctl, info); spin_unlock(&ctl->tree_lock); if (ret) @@ -3550,6 +3605,7 @@ again: } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); @@ -3593,7 +3649,7 @@ have_info: bit_off = offset; bit_bytes = ctl->unit; - ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); if (!ret) { if (bit_off == offset) { ret = 1; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a16a029ad3b1..f251865eb6f3 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -23,6 +23,7 @@ struct btrfs_free_space { struct rb_node offset_index; u64 offset; u64 bytes; + u64 max_extent_size; unsigned long *bitmap; struct list_head list; }; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 265e03c73f4d..be4d22a5022f 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT); + btrfs_std_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d4a582ac3f73..767a6056ac45 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -488,17 +488,17 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, prealloc); + btrfs_delalloc_release_space(inode, 0, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, prealloc); + btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e33dff356460..4439fbb4ff45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + /* + * Don't forget to free the reserved space, as for 
inlined extent + * it won't count as data extent, free them directly here. + * And at reserve time, it's always aligned to page size, so + * just free one page here. + */ + btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; + /* + * atomic_sub_return implies a barrier for waitqueue_active + */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) @@ -1766,7 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space_noquota(inode, + state->start, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); @@ -1845,8 +1856,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, int ret; ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); - if (ret) - bio_endio(bio, ret); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -1859,15 +1872,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; + enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; int ret = 0; int skip_sum; - int metadata = 0; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(inode)) - metadata = 2; + metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); @@ -1906,8 +1919,10 @@ mapit: ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); out: - if (ret < 0) - bio_endio(bio, ret); + if (ret < 0) { + bio->bi_error = ret; + bio_endio(bio); + } return ret; } @@ -1985,7 +2000,8 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2111,7 +2127,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, - btrfs_ino(inode), file_pos, &ins); + btrfs_ino(inode), file_pos, + ram_bytes, &ins); + /* + * Release the reserved range from inode dirty range map, as it is + * already moved into delayed_ref_head + */ + btrfs_qgroup_release_data(inode, file_pos, ram_bytes); out: btrfs_free_path(path); @@ -2569,7 +2591,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, - new->file_pos, 0); /* start - extent_offset */ + new->file_pos); /* start - extent_offset */ if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_free_path; @@ -2595,7 +2617,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) return; list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); kfree(old); } kfree(new); @@ -2820,6 +2841,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, 
&ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + + /* + * For the mmap write (mmap + memset) case, we still reserve + * space for NOCOW range. + * As NOCOW won't cause a new delayed ref, just free the space + */ + btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -3014,8 +3043,6 @@ static int __readpage_endio_check(struct inode *inode, char *kaddr; u32 csum_expected; u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); csum_expected = *(((u32 *)io_bio->csum) + icsum); @@ -3028,9 +3055,8 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - if (__ratelimit(&_rs)) - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -3654,6 +3680,35 @@ cache_index: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We don't persist the id of the transaction where an unlink operation + * against the inode was last made. So here we assume the inode might + * have been evicted, and therefore the exact value of last_unlink_trans + * lost, and set it to last_trans to avoid metadata inconsistencies + * between the inode and its parent if the inode is fsync'ed and the log + * replayed. For example, in the scenario: + * + * touch mydir/foo + * ln mydir/foo mydir/bar + * sync + * unlink mydir/bar + * echo 2 > /proc/sys/vm/drop_caches # evicts inode + * xfs_io -c fsync mydir/foo + * <power failure> + * mount fs, triggers fsync log replay + * + * We must make sure that when we fsync our inode foo we also log its + * parent inode, otherwise after log replay the parent still has the + * dentry with the "bar" name but our inode foo has a link count of 1 + * and doesn't have an inode ref with the name "bar" anymore. + * + * Setting last_unlink_trans to last_trans is a pessimistic approach, + * but it guarantees correctness at the expense of occasional full + * transaction commits on fsync if our inode is a directory, or if our + * inode is not a directory, logging its parent unnecessarily.
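The zeroit path above swaps an open-coded DEFINE_RATELIMIT_STATE plus __ratelimit() pair for btrfs_warn_rl(), which keeps per-callsite ratelimit state internally. A minimal userspace sketch of the same interval/burst idea (hypothetical names; the real helpers live in include/linux/ratelimit.h):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct ratelimit_state {
	time_t interval;	/* window length, seconds */
	int burst;		/* messages allowed per window */
	time_t begin;
	int printed;
};

static bool ratelimit_ok(struct ratelimit_state *rs)
{
	time_t now = time(NULL);

	if (now - rs->begin >= rs->interval) {
		/* new window: reset the budget */
		rs->begin = now;
		rs->printed = 0;
	}
	if (rs->printed >= rs->burst)
		return false;
	rs->printed++;
	return true;
}

#define warn_rl(rs, fmt, ...)						\
	do {								\
		if (ratelimit_ok(rs))					\
			fprintf(stderr, "warning: " fmt "\n", ##__VA_ARGS__); \
	} while (0)

int main(void)
{
	/* One state per call site, as btrfs_warn_rl() arranges internally. */
	static struct ratelimit_state rs = { .interval = 5, .burst = 10 };

	for (int i = 0; i < 100; i++)	/* only the first 10 reach stderr */
		warn_rl(&rs, "csum failed ino %d off %d", 42, i * 4096);
	return 0;
}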
+ */ + BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -4184,6 +4239,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } +static int truncate_inline_extent(struct inode *inode, + struct btrfs_path *path, + struct btrfs_key *found_key, + const u64 item_end, + const u64 new_size) +{ + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u32 size = (u32)(new_size - found_key->offset); + struct btrfs_root *root = BTRFS_I(inode)->root; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { + loff_t offset = new_size; + loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + + /* + * Zero out the remaining of the last page of our inline extent, + * instead of directly truncating our inline extent here - that + * would be much more complex (decompressing all the data, then + * compressing the truncated data, which might be bigger than + * the size of the inline extent, resize the extent, etc). + * We release the path because to get the page we might need to + * read the extent item from disk (data not in the page cache). + */ + btrfs_release_path(path); + return btrfs_truncate_page(inode, offset, page_end - offset, 0); + } + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root, path, size, 1); + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + inode_sub_bytes(inode, item_end + 1 - new_size); + + return 0; +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4378,27 +4474,40 @@ search_again: * special encodings */ if (!del_item && - btrfs_file_extent_compression(leaf, fi) == 0 && btrfs_file_extent_encryption(leaf, fi) == 0 && btrfs_file_extent_other_encoding(leaf, fi) == 0) { - u32 size = new_size - found_key.offset; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - - new_size); /* - * update the ram bytes to properly reflect - * the new size of our item + * Need to release path in order to truncate a + * compressed extent. So delete any accumulated + * extent items so far. 
*/ - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = - btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root, path, size, 1); + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE && pending_del_nr) { + err = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, + root, + err); + goto error; + } + pending_del_nr = 0; + } + + err = truncate_inline_extent(inode, path, + &found_key, + item_end, + new_size); + if (err) { + btrfs_abort_transaction(trans, + root, err); + goto error; + } } else if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - inode_sub_bytes(inode, item_end + 1 - - found_key.offset); + inode_sub_bytes(inode, item_end + 1 - new_size); } } delete: @@ -4428,7 +4537,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset, 0); + ino, extent_offset); BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, @@ -4542,14 +4651,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, + round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, + round_down(from, PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE); ret = -ENOMEM; goto out; } @@ -4617,7 +4729,8 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, + PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -5015,6 +5128,18 @@ static void evict_inode_truncate_pages(struct inode *inode) spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, 0, &cached_state); + + /* + * If still has DELALLOC flag, the extent didn't reach disk, + * and its reserved space won't be freed by delayed_ref. + * So we need to free its reserved space here. + * (Refer to comment in btrfs_invalidatepage, case 2) + * + * Note, end is the bytenr of last byte, so we need + 1 here. + */ + if (state->state & EXTENT_DELALLOC) + btrfs_qgroup_free_data(inode, start, end - start + 1); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | @@ -5051,7 +5176,8 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ - btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (!special_file(inode->i_mode)) + btrfs_wait_ordered_range(inode, 0, (u64)-1); btrfs_free_io_failure_record(inode, 0, (u64)-1); @@ -6876,8 +7002,7 @@ out: trace_btrfs_get_extent(root, em); - if (path) - btrfs_free_path(path); + btrfs_free_path(path); if (trans) { ret = btrfs_end_transaction(trans, root); if (!err) @@ -7376,6 +7501,10 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; +}; static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -7383,10 +7512,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; + struct btrfs_dio_data *dio_data = NULL; u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -7404,7 +7533,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * that anything that needs to check if there's a transction doesn't get * confused. */ - outstanding_extents = current->journal_info; + dio_data = current->journal_info; current->journal_info = NULL; } @@ -7536,17 +7665,18 @@ unlock: * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (*outstanding_extents) { - (*outstanding_extents)--; + if (dio_data->outstanding_extents) { + (dio_data->outstanding_extents)--; } else { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - current->journal_info = outstanding_extents; - btrfs_free_reserved_data_space(inode, len); - set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); + btrfs_free_reserved_data_space(inode, start, len); + WARN_ON(dio_data->reserve < len); + dio_data->reserve -= len; + current->journal_info = dio_data; } /* @@ -7569,8 +7699,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); - if (outstanding_extents) - current->journal_info = outstanding_extents; + if (dio_data) + current->journal_info = dio_data; return ret; } @@ -7688,13 +7818,13 @@ struct btrfs_retry_complete { int uptodate; }; -static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct bio_vec *bvec; int i; - if (err) + if (bio->bi_error) goto end; done->uptodate = 1; @@ -7743,7 +7873,7 @@ try_again: return 0; } -static void btrfs_retry_endio(struct bio *bio, int err) +static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); @@ -7752,7 +7882,7 @@ static void btrfs_retry_endio(struct bio *bio, int err) int ret; int i; - if (err) + if (bio->bi_error) goto end; uptodate = 1; @@ -7835,12 +7965,13 @@ static int btrfs_subio_endio_read(struct inode *inode, } } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static void btrfs_endio_direct_read(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + int err = bio->bi_error; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = 
btrfs_subio_endio_read(inode, io_bio, err); @@ -7851,17 +7982,14 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip); - /* If we had a csum failure make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); if (io_bio->end_io) io_bio->end_io(io_bio, err); bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio, int err) +static void btrfs_endio_direct_write(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; @@ -7875,7 +8003,8 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, - ordered_bytes, !err); + ordered_bytes, + !bio->bi_error); if (!ret) goto out_test; @@ -7898,10 +8027,7 @@ out_test: kfree(dip); - /* If we had an error make sure to clear the uptodate flag */ - if (err) - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); - dio_end_io(dio_bio, err); + dio_end_io(dio_bio, bio->bi_error); bio_put(bio); } @@ -7916,9 +8042,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } -static void btrfs_end_dio_bio(struct bio *bio, int err) +static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; + int err = bio->bi_error; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -7947,8 +8074,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags); - bio_endio(dip->orig_bio, 0); + dip->dio_bio->bi_error = 0; + bio_endio(dip->orig_bio); } out: bio_put(bio); @@ -7957,8 +8084,11 @@ out: static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, u64 first_sector, gfp_t gfp_flags) { - int nr_vecs = bio_get_nr_vecs(bdev); - return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); + struct bio *bio; + bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags); + if (bio) + bio_associate_current(bio); + return bio; } static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, @@ -8219,7 +8349,8 @@ free_ordered: * callbacks - they require an allocated dip and a clone of dio_bio. */ if (io_bio && dip) { - bio_endio(io_bio, ret); + io_bio->bi_error = -EIO; + bio_endio(io_bio); /* * The end io callbacks free our dip, do the final put on io_bio * and all the cleanup and final put for dio_bio (through @@ -8246,7 +8377,7 @@ free_ordered: unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); } - clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. 
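The completion changes in these hunks track the block layer's switch from bio_endio(bio, err) to a void bio_endio(bio) that reads the error out of the bio itself. A toy model with mock types (not the real struct bio) showing the "store the error, then complete" pattern the code above now follows:

#include <stdio.h>

/* Mock of the post-4.3 bio shape: the error travels inside the bio. */
struct bio {
	int bi_error;			 /* 0 or a negative errno */
	void (*bi_end_io)(struct bio *); /* note: no separate "int err" */
};

static void bio_endio(struct bio *bio)
{
	bio->bi_end_io(bio);
}

static void my_end_io(struct bio *bio)
{
	/* Old-style handlers received err; new ones read bio->bi_error. */
	if (bio->bi_error)
		printf("I/O failed: %d\n", bio->bi_error);
	else
		printf("I/O ok\n");
}

int main(void)
{
	struct bio bio = { .bi_end_io = my_end_io };

	bio.bi_error = -5;	/* set the error first (think -EIO)... */
	bio_endio(&bio);	/* ...then complete, as the hunks above do */

	bio.bi_error = 0;
	bio_endio(&bio);
	return 0;
}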
@@ -8296,7 +8427,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - u64 outstanding_extents = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_data dio_data = { 0 }; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8331,10 +8463,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, mutex_unlock(&inode->i_mutex); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, count); + ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) goto out; - outstanding_extents = div64_u64(count + + dio_data.outstanding_extents = div64_u64(count + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); @@ -8343,7 +8475,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * do the accounting properly if we go over the number we * originally calculated. Abuse current->journal_info for this. */ - current->journal_info = &outstanding_extents; + dio_data.reserve = round_up(count, root->sectorsize); + current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_end(inode); @@ -8358,18 +8491,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { - /* - * If the error comes from submitting stage, - * btrfs_get_blocsk_direct() has free'd data space, - * and metadata space will be handled by - * finish_ordered_fn, don't do that again to make - * sure bytes_may_use is correct. - */ - if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, - &BTRFS_I(inode)->runtime_flags)) - btrfs_delalloc_release_space(inode, count); + if (dio_data.reserve) + btrfs_delalloc_release_space(inode, offset, + dio_data.reserve); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); } out: @@ -8528,6 +8654,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } } + /* + * Qgroup reserved space handler + * Page here will be either + * 1) Already written to disk + * In this case, its reserved space is released from data rsv map + * and will be freed by delayed_ref handler finally. + * So even we call qgroup_free_data(), it won't decrease reserved + * space. + * 2) Not written to disk + * This means the reserved space should be freed here. 
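btrfs_direct_IO() above sizes dio_data.outstanding_extents by the worst-case number of extents the write can turn into. A standalone sketch of that ceiling division; the 128MiB constant mirrors BTRFS_MAX_EXTENT_SIZE of this kernel era and should be treated as an assumption:

#include <inttypes.h>
#include <stdio.h>

/* Assumed to mirror BTRFS_MAX_EXTENT_SIZE (128MiB). */
#define MAX_EXTENT_SIZE (128ULL * 1024 * 1024)

/* Worst-case number of extents a write of "count" bytes can become. */
static uint64_t outstanding_extents(uint64_t count)
{
	return (count + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
}

int main(void)
{
	printf("%" PRIu64 "\n", outstanding_extents(1));		   /* 1 */
	printf("%" PRIu64 "\n", outstanding_extents(MAX_EXTENT_SIZE));	   /* 1 */
	printf("%" PRIu64 "\n", outstanding_extents(MAX_EXTENT_SIZE + 1)); /* 2 */
	return 0;
}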
+ */ + btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8578,7 +8716,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; sb_start_pagefault(inode->i_sb); - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8597,8 +8739,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) again: lock_page(page); size = i_size_read(inode); - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; if ((page->mapping != inode->i_mapping) || (page_start >= size)) { @@ -8675,7 +8815,7 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; @@ -8964,6 +9104,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: @@ -9600,6 +9741,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 cur_offset = start; u64 i_size; u64 cur_bytes; + u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; @@ -9616,6 +9758,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); + /* + * If we are severely fragmented we could end up with really + * small allocations, so if the allocator is returning small + * chunks lets make its job easier by only searching for those + * sized chunks. 
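A toy model of the cur_bytes clamp added to __btrfs_prealloc_file_range() above, with a fake allocator standing in for btrfs_reserve_extent(): once the allocator hands back a small chunk, later requests never exceed it, so a fragmented allocator is not repeatedly asked for large extents it cannot provide:

#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/* Fake, badly fragmented allocator: never returns more than 1MiB. */
static uint64_t reserve_extent(uint64_t want)
{
	return min_u64(want, 1ULL << 20);
}

int main(void)
{
	uint64_t remaining = 256ULL << 20;	/* preallocate 256MiB */
	uint64_t last_alloc = UINT64_MAX;	/* (u64)-1: nothing learned yet */
	int loops = 0;

	while (remaining > 0) {
		uint64_t cur = min_u64(remaining, 64ULL << 20);
		uint64_t got;

		/* The fix above: never ask for more than we last got. */
		cur = min_u64(cur, last_alloc);
		got = reserve_extent(cur);
		last_alloc = got;
		remaining -= got;
		loops++;
	}
	printf("done in %d allocations\n", loops);
	return 0;
}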
+ */ + cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { @@ -9624,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } + last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0770c91586ca..da94138eb85e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1030,6 +1030,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, struct extent_map *em; int ret = 1; bool next_mergeable = true; + bool prev_mergeable = true; /* * make sure that once we start defragging an extent, we keep on @@ -1050,13 +1051,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, goto out; } + if (!*defrag_end) + prev_mergeable = false; + next_mergeable = defrag_check_next_extent(inode, em); /* * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it */ if (!compress && (*last_len == 0 || *last_len >= thresh) && - (em->len >= thresh || !next_mergeable)) + (em->len >= thresh || (!next_mergeable && !prev_mergeable))) ret = 0; out: /* @@ -1116,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; @@ -1206,7 +1211,8 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -1231,7 +1237,9 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); return ret; } @@ -1338,7 +1346,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); + btrfs_debug(root->fs_info, "defrag_file cancelled"); ret = -EAGAIN; break; } @@ -1575,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", + btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -1933,6 +1941,7 @@ static noinline int copy_to_sk(struct btrfs_root *root, u64 found_transid; struct extent_buffer *leaf; struct btrfs_ioctl_search_header sh; + struct btrfs_key test; unsigned long item_off; unsigned long item_len; int nritems; @@ -2016,12 +2025,17 @@ static noinline int copy_to_sk(struct btrfs_root *root, } advance_key: ret = 0; - if (key->offset < (u64)-1 && key->offset < sk->max_offset) + test.objectid = sk->max_objectid; + test.type = sk->max_type; + test.offset = sk->max_offset; + if (btrfs_comp_cpu_keys(key, &test) >= 0) + ret = 1; + else if (key->offset < (u64)-1) key->offset++; - else if (key->type < (u8)-1 && key->type < sk->max_type) { + else if (key->type < (u8)-1) { key->offset = 0; key->type++; - } else if (key->objectid < (u64)-1 && key->objectid < 
sk->max_objectid) { + } else if (key->objectid < (u64)-1) { key->offset = 0; key->type = 0; key->objectid++; @@ -2071,7 +2085,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", + btrfs_err(info, "could not find root %llu", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2211,7 +2225,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); + btrfs_err(info, "could not find root %llu", tree_id); ret = -ENOENT; goto out; } @@ -2689,7 +2703,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; - struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; @@ -2701,7 +2714,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) fi_args->num_devices = fs_devices->num_devices; memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } @@ -2842,8 +2855,7 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - if (inode1 != inode2) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -2861,8 +2873,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, swap(loff1, loff2); } lock_extent_range(inode1, loff1, len); - if (inode1 != inode2) - lock_extent_range(inode2, loff2, len); + lock_extent_range(inode2, loff2, len); } struct cmp_pages { @@ -3195,41 +3206,6 @@ out: return ret; } -/* Helper to check and see if this root currently has a ref on the given disk - * bytenr. If it does then we need to update the quota for this root. This - * doesn't do anything if quotas aren't enabled. - */ -static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 disko) -{ - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); - struct ulist *roots; - struct ulist_iterator uiter; - struct ulist_node *root_node = NULL; - int ret; - - if (!root->fs_info->quota_enabled) - return 1; - - btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - ret = btrfs_find_all_roots(trans, root->fs_info, disko, - tree_mod_seq_elem.seq, &roots); - if (ret < 0) - goto out; - ret = 0; - ULIST_ITER_INIT(&uiter); - while ((root_node = ulist_next(roots, &uiter))) { - if (root_node->val == root->objectid) { - ret = 1; - break; - } - } - ulist_free(roots); -out: - btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - return ret; -} - static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, @@ -3320,6 +3296,150 @@ static void clone_update_extent_map(struct inode *inode, &BTRFS_I(inode)->runtime_flags); } +/* + * Make sure we do not end up inserting an inline extent into a file that has + * already other (non-inline) extents. 
If a file has an inline extent it can + * not have any other extents and the (single) inline extent must start at the + * file offset 0. Failing to respect these rules will lead to file corruption, + * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc + * + * We can have extents that have been already written to disk or we can have + * dirty ranges still in delalloc, in which case the extent maps and items are + * created only when we run delalloc, and the delalloc ranges might fall outside + * the range we are currently locking in the inode's io tree. So we check the + * inode's i_size because of that (i_size updates are done while holding the + * i_mutex, which we are holding here). + * We also check to see if the inode has a size not greater than "datal" but has + * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are + * protected against such concurrent fallocate calls by the i_mutex). + * + * If the file has no extents but a size greater than datal, do not allow the + * copy because we would need turn the inline extent into a non-inline one (even + * with NO_HOLES enabled). If we find our destination inode only has one inline + * extent, just overwrite it with the source inline extent if its size is less + * than the source extent's size, or we could copy the source inline extent's + * data into the destination inode's inline extent if the later is greater then + * the former. + */ +static int clone_copy_inline_extent(struct inode *src, + struct inode *dst, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 skip, + const u64 size, + char *inline_data) +{ + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + root->sectorsize); + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) + return -EOPNOTSUPP; + + key.objectid = btrfs_ino(dst); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) { + ASSERT(key.offset > 0); + return -EOPNOTSUPP; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + u64 ext_len; + + /* + * If the file size is <= datal, make sure there are no other + * extents following (can happen do to an fallocate call with + * the flag FALLOC_FL_KEEP_SIZE). + */ + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent, it can not have other extents + * following it. 
+ */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + if (ext_len > aligned_end) + return -EOPNOTSUPP; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + return ret; + } else if (ret == 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) + return -EOPNOTSUPP; + } + } + +copy_inline_extent: + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt the same way. + */ + if (i_size_read(dst) > datal) { + /* + * If the destination inode has an inline extent... + * This would require copying the data from the source inline + * extent into the beginning of the destination's inline extent. + * But this is really complex, both extents can be compressed + * or just one of them, which would require decompressing and + * re-compressing data (which could increase the new compressed + * size, not allowing the compressed data to fit anymore in an + * inline extent). + * So just don't support this case for now (it should be rare, + * we are not really saving space when cloning inline extents). + */ + return -EOPNOTSUPP; + } + + btrfs_release_path(path); + ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + if (ret) + return ret; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + return ret; + + if (skip) { + const u32 start = btrfs_file_extent_calc_inline_size(0); + + memmove(inline_data + start, inline_data + start + skip, datal); + } + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + inode_add_bytes(dst, datal); + + return 0; +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -3344,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - int no_quota; const u64 len = olen_aligned; - u64 last_disko = 0; u64 last_dest_end = destoff; ret = -ENOMEM; @@ -3392,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, nritems = btrfs_header_nritems(path->nodes[0]); process_slot: - no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -3544,35 +3661,13 @@ process_slot: btrfs_set_file_extent_num_bytes(leaf, extent, datal); - /* - * We need to look up the roots that point at - * this bytenr and see if the new root does. If - * it does not we need to make sure we update - * quotas appropriately. - */ - if (disko && root != BTRFS_I(src)->root && - disko != last_disko) { - no_quota = check_ref(trans, root, - disko); - if (no_quota < 0) { - btrfs_abort_transaction(trans, - root, - ret); - btrfs_end_transaction(trans, - root); - ret = no_quota; - goto out; - } - } - if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao, - no_quota); + new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, root, @@ -3586,21 +3681,6 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; - u64 aligned_end = 0; - - /* - * Don't copy an inline extent into an offset - * greater than zero. Having an inline extent - * at such an offset results in chaos as btrfs - * isn't prepared for such cases. 
Just skip - * this case for the same reasons as commented - * at btrfs_ioctl_clone(). - */ - if (last_dest_end > 0) { - ret = -EOPNOTSUPP; - btrfs_end_transaction(trans, root); - goto out; - } if (off > key.offset) { skip = off - key.offset; @@ -3618,42 +3698,22 @@ process_slot: size -= skip + trim; datal -= skip + trim; - aligned_end = ALIGN(new_key.offset + datal, - root->sectorsize); - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - aligned_end, - 1); + ret = clone_copy_inline_extent(src, inode, + trans, path, + &new_key, + drop_start, + datal, + skip, size, buf); if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); - btrfs_end_transaction(trans, root); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); + root, + ret); btrfs_end_transaction(trans, root); goto out; } - - if (skip) { - u32 start = - btrfs_file_extent_calc_inline_size(0); - memmove(buf+start, buf+start+skip, - datal); - } - leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - inode_add_bytes(inode, datal); } /* If we have an implicit hole (NO_HOLES feature). */ @@ -3787,13 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto out_fput; if (!same_inode) { - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - } + btrfs_double_inode_lock(src, inode); } else { mutex_lock(&src->i_mutex); } @@ -3843,8 +3897,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, lock_extent_range(src, lock_start, lock_len); } else { - lock_extent_range(src, off, len); - lock_extent_range(inode, destoff, len); + btrfs_double_extent_lock(src, off, inode, destoff, len); } ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); @@ -3855,9 +3908,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); } else { - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, - destoff + len - 1); + btrfs_double_extent_unlock(src, off, inode, destoff, len); } /* * Truncate page cache pages so that future reads will see the cloned @@ -3866,17 +3917,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, truncate_inode_pages_range(&inode->i_data, destoff, PAGE_CACHE_ALIGN(destoff + len) - 1); out_unlock: - if (!same_inode) { - if (inode < src) { - mutex_unlock(&src->i_mutex); - mutex_unlock(&inode->i_mutex); - } else { - mutex_unlock(&inode->i_mutex); - mutex_unlock(&src->i_mutex); - } - } else { + if (!same_inode) + btrfs_double_inode_unlock(src, inode); + else mutex_unlock(&src->i_mutex); - } out_fput: fdput(src_file); out_drop_write: @@ -4647,6 +4691,11 @@ locked: bctl->flags |= BTRFS_BALANCE_TYPE_MASK; } + if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_bctl; + } + do_balance: /* * Ownership of bctl and mutually_exclusive_operation_running @@ -4658,12 +4707,15 @@ do_balance: need_unlock = false; ret = btrfs_balance(bctl, bargs); + bctl = NULL; if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } +out_bctl: + kfree(bctl); out_bargs: kfree(bargs); 
out_unlock: @@ -4814,7 +4866,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f8229ef1b46d..8077461fc56a 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_writers) && waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); @@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -241,6 +250,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) */ void btrfs_tree_lock(struct extent_buffer *eb) { + WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); @@ -279,6 +289,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); + /* + * Make sure counter is updated before we wake up waiters. 
+ */ smp_mb(); if (waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52170cf1757e..8c27292ea9ea 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -409,6 +412,9 @@ have_entry: if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, spin_lock_irq(&log->log_extents_lock[index]); while (!list_empty(&log->logged_list[index])) { + struct inode *inode; ordered = list_first_entry(&log->logged_list[index], struct btrfs_ordered_extent, log_list); list_del_init(&ordered->log_list); + inode = ordered->inode; spin_unlock_irq(&log->log_extents_lock[index]); if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - struct inode *inode = ordered->inode; u64 start = ordered->file_offset; u64 end = ordered->file_offset + ordered->len - 1; @@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, &ordered->flags)); /* - * If our ordered extent completed it means it updated the - * fs/subvol and csum trees already, so no need to make the - * current transaction's commit wait for it, as we end up - * holding memory unnecessarily and delaying the inode's iput - * until the transaction commit (we schedule an iput for the - * inode when the ordered extent's refcount drops to 0), which - * prevents it from being evictable until the transaction - * commits. + * In order to keep us from losing our ordered extent + * information when committing the transaction we have to make + * sure that any logged extents are completed when we go to + * commit the transaction. To do this we simply increase the + * current transactions pending_ordered counter and decrement it + * when the ordered extent completes. 
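The pending_ordered scheme described above is a counted-completion pattern: each logged ordered extent bumps a transaction counter, and the last completer wakes the committer. A userspace sketch with stdatomic and a condition variable (mock names, not the btrfs types; atomic_fetch_sub plays the role of atomic_dec_and_test):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NR_ORDERED 4

static atomic_int pending = NR_ORDERED;	/* trans->pending_ordered */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pending_wait = PTHREAD_COND_INITIALIZER;

static void *ordered_complete(void *arg)
{
	(void)arg;
	usleep(1000);			/* the extent finishes its I/O */

	/* Like atomic_dec_and_test(): only the last completer wakes up
	 * the waiter. */
	if (atomic_fetch_sub(&pending, 1) == 1) {
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&pending_wait);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NR_ORDERED];

	for (int i = 0; i < NR_ORDERED; i++)
		pthread_create(&t[i], NULL, ordered_complete, NULL);

	/* Commit side: sleep until every pending ordered extent is done. */
	pthread_mutex_lock(&lock);
	while (atomic_load(&pending) != 0)
		pthread_cond_wait(&pending_wait, &lock);
	pthread_mutex_unlock(&lock);
	puts("commit may proceed");

	for (int i = 0; i < NR_ORDERED; i++)
		pthread_join(t[i], NULL);
	return 0;
}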
*/ - if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) - btrfs_put_ordered_extent(ordered); - else - list_add_tail(&ordered->trans_list, &trans->ordered); - + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&tree->lock); + } + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; + bool dec_pending_ordered = false; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); @@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) + dec_pending_ordered = true; spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (dec_pending_ordered) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&root->fs_info->trans_lock); + trans = root->fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 7176cc0fe43f..23c96059cef2 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -73,6 +73,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent * in the logging code. */ +#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to + * complete in the current transaction. 
*/ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dca137b04095..f9e60231f685 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = { .extract = prop_compression_extract, .inheritable = 1 }, - { - .xattr_name = NULL - } }; void __init btrfs_props_init(void) { - struct prop_handler *p; + int i; hash_init(prop_handlers_ht); - for (p = &prop_handlers[0]; p->xattr_name; p++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); hash_add(prop_handlers_ht, &p->node, h); @@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *parent) { - const struct prop_handler *h; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int i; if (!test_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(parent)->runtime_flags)) return 0; - for (h = &prop_handlers[0]; h->xattr_name; h++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + const struct prop_handler *h = &prop_handlers[i]; const char *value; u64 num_bytes; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8a8202956576..46476c226395 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -376,7 +376,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = find_qgroup_rb(fs_info, found_key.offset); if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { - btrfs_err(fs_info, "inconsitent qgroup config"); + btrfs_err(fs_info, "inconsistent qgroup config"); flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } if (!qgroup) { @@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, } } - /* For exclusive extent, free its reserved bytes too */ - if (nr_old_roots == 0 && nr_new_roots == 1 && - cur_new_count == nr_new_roots) - qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } @@ -2035,7 +2031,7 @@ out: return ret; } -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2116,14 +2112,13 @@ out: return ret; } -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; - struct btrfs_fs_info *fs_info = root->fs_info; struct ulist_node *unode; struct ulist_iterator uiter; - u64 ref_root = root->root_key.objectid; int ret = 0; if (!is_fstree(ref_root)) @@ -2169,6 +2164,11 @@ out: spin_unlock(&fs_info->qgroup_lock); } +static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, + num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) @@ -2188,10 +2188,10 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans) { struct btrfs_key found; + struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = 
SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; @@ -2229,7 +2229,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); + if (!scratch_leaf) { + ret = -ENOMEM; + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto out; + } + extent_buffer_get(scratch_leaf); + btrfs_tree_read_lock(scratch_leaf); + btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -2255,6 +2263,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, goto out; } out: + if (scratch_leaf) { + btrfs_tree_read_unlock_blocking(scratch_leaf); + free_extent_buffer(scratch_leaf); + } btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); return ret; @@ -2266,16 +2278,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; path = btrfs_alloc_path(); if (!path) goto out; - scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); - if (!scratch_leaf) - goto out; err = 0; while (!err) { @@ -2287,8 +2295,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!fs_info->quota_enabled) { err = -EINTR; } else { - err = qgroup_rescan_leaf(fs_info, path, trans, - scratch_leaf); + err = qgroup_rescan_leaf(fs_info, path, trans); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2297,7 +2304,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: - kfree(scratch_leaf); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); @@ -2486,3 +2492,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } + +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or do + * nothing if the range is already reserved. + * + * Return 0 for successful reserve + * Return <0 for error (including -EDQUOT) + * + * NOTE: this function may sleep for memory allocation.
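A toy single-inode model of the reserve semantics documented here: only newly reserved ranges are charged against the quota, and a failed charge undoes exactly the ranges this call set. A flat array stands in for the io_tree and its changeset; all names are simplifications, not the kernel API:

#include <stdio.h>

#define NPAGES 16

static int reserved[NPAGES];	/* stands in for EXTENT_QGROUP_RESERVED bits */
static long quota_left = 8;	/* pages of quota this qgroup still has */

/* Reserve pages [start, start + len); on failure undo only what we set. */
static int reserve_data(int start, int len)
{
	int changed[NPAGES];
	int nchanged = 0;

	for (int i = start; i < start + len; i++) {
		if (!reserved[i]) {	/* already-reserved pages cost nothing */
			reserved[i] = 1;
			changed[nchanged++] = i;
		}
	}
	if (quota_left < nchanged) {
		/* cleanup: clear exactly the ranges this call marked */
		for (int i = 0; i < nchanged; i++)
			reserved[changed[i]] = 0;
		return -1;		/* think -EDQUOT */
	}
	quota_left -= nchanged;
	return 0;
}

int main(void)
{
	printf("%d\n", reserve_data(0, 6));	/* 0: six new pages charged */
	printf("%d\n", reserve_data(4, 4));	/* 0: only pages 6 and 7 are new */
	printf("%d\n", reserve_data(8, 8));	/* -1: over quota, fully undone */
	printf("quota left: %ld\n", quota_left);/* 0 */
	return 0;
}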
+ */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + len == 0) + return 0; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + trace_btrfs_qgroup_reserve_data(inode, start, len, + changeset.bytes_changed, + QGROUP_RESERVE); + if (ret < 0) + goto cleanup; + ret = qgroup_reserve(root, changeset.bytes_changed); + if (ret < 0) + goto cleanup; + + ulist_free(changeset.range_changed); + return ret; + +cleanup: + /* cleanup already reserved ranges */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(changeset.range_changed, &uiter))) + clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, + GFP_NOFS); + ulist_free(changeset.range_changed); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, + int free) +{ + struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (!changeset.range_changed) + return -ENOMEM; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + if (ret < 0) + goto out; + + if (free) { + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + trace_op = QGROUP_FREE; + } + trace_btrfs_qgroup_release_data(inode, start, len, + changeset.bytes_changed, trace_op); +out: + ulist_free(changeset.range_changed); + return ret; +} + +/* + * Free a reserved space range from io_tree and related qgroups + * + * Should be called when a range of pages get invalidated before reaching disk. + * Or for error cleanup case. + * + * For data written to disk, use btrfs_qgroup_release_data(). + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 1); +} + +/* + * Release a reserved space range from io_tree only. + * + * Should be called when a range of pages get written to disk and corresponding + * FILE_EXTENT is inserted into corresponding root. + * + * Since new qgroup accounting framework will only update qgroup numbers at + * commit_transaction() time, its reserved space shouldn't be freed from + * related qgroups. + * + * But we should release the range from io_tree, to allow further write to be + * COWed. + * + * NOTE: This function may sleep for memory allocation. 
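btrfs_qgroup_free_data() and btrfs_qgroup_release_data() defined here are thin wrappers over one worker that differ only in whether the quota charge is refunded. A minimal sketch of that shared-worker shape, with trivially simplified bookkeeping rather than the real accounting:

#include <stdio.h>

static long reserved_bytes = 4096;	/* bytes currently charged */

/* One worker plus a flag, mirroring __btrfs_qgroup_release_data() above. */
static void release_data_common(long len, int free_space)
{
	printf("clearing %ld bytes of QGROUP_RESERVED state\n", len);
	if (free_space)			/* only "free" refunds the quota */
		reserved_bytes -= len;
}

/* Range was invalidated before reaching disk: refund the charge. */
static void qgroup_free_data(long len)
{
	release_data_common(len, 1);
}

/* Range was written out; the charge moves to the commit path: no refund. */
static void qgroup_release_data(long len)
{
	release_data_common(len, 0);
}

int main(void)
{
	qgroup_release_data(1024);
	qgroup_free_data(3072);
	printf("still charged: %ld\n", reserved_bytes);	/* 1024 */
	return 0;
}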
+ */ +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 0); +} + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) +{ + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + num_bytes == 0) + return 0; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + ret = qgroup_reserve(root, num_bytes); + if (ret < 0) + return ret; + atomic_add(num_bytes, &root->qgroup_meta_rsv); + return ret; +} + +void btrfs_qgroup_free_meta_all(struct btrfs_root *root) +{ + int reserved; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); + if (reserved == 0) + return; + qgroup_free(root, reserved); +} + +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) +{ + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); + atomic_sub(num_bytes, &root->qgroup_meta_rsv); + qgroup_free(root, num_bytes); +} + +/* + * Check for qgroup reserved space leaks, normally at inode destroy + * time + */ +void btrfs_qgroup_check_reserved_leak(struct inode *inode) +{ + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator iter; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (WARN_ON(!changeset.range_changed)) + return; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + + WARN_ON(ret < 0); + if (WARN_ON(changeset.bytes_changed)) { + ULIST_ITER_INIT(&iter); + while ((unode = ulist_next(changeset.range_changed, &iter))) { + btrfs_warn(BTRFS_I(inode)->root->fs_info, + "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", + inode->i_ino, unode->val, unode->aux); + } + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + } + ulist_free(changeset.range_changed); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 6387dcfa354c..ecb2c143ef75 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * For qgroup event trace points only + */ +#define QGROUP_RESERVE (1<<0) +#define QGROUP_RELEASE (1<<1) +#define QGROUP_FREE (1<<2) + int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_trans_handle *trans, @@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes); +/* + * TODO: Add proper trace point for it, as btrfs_qgroup_free() is + * called from everywhere, so it can't provide a good trace for the + * delayed ref case.
+ */ +static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) +{ + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif +/* New io_tree based accurate qgroup reserve API */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_free_meta_all(struct btrfs_root *root); +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fa72068bd256..1a33d3eb36de 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -61,9 +61,10 @@ #define RBIO_CACHE_SIZE 1024 enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE = 0, - BTRFS_RBIO_READ_REBUILD = 1, - BTRFS_RBIO_PARITY_SCRUB = 2, + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, }; struct btrfs_raid_bio { @@ -602,6 +603,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, cur->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + cur->operation == BTRFS_RBIO_REBUILD_MISSING) + return 0; + return 1; } @@ -793,7 +798,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) if (next->operation == BTRFS_RBIO_READ_REBUILD) async_read_rebuild(next); - else if (next->operation == BTRFS_RBIO_WRITE) { + else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { + steal_rbio(rbio, next); + async_read_rebuild(next); + } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); async_rmw_stripe(next); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { @@ -802,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } goto done_nolock; - } else if (waitqueue_active(&h->wait)) { + /* + * The barrier for this waitqueue_active is not needed, + * we're protected by h->lock and can't miss a wakeup. + */ + } else if (waitqueue_active(&h->wait)) { spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); wake_up(&h->wait); @@ -851,7 +863,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -864,9 +876,8 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - if (uptodate) - set_bit(BIO_UPTODATE, &cur->bi_flags); - bio_endio(cur, err); + cur->bi_error = err; + bio_endio(cur); cur = next; } } @@ -875,9 +886,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) * end io function used by finish_rmw. 
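rbio_orig_end_io() in this file walks a singly linked chain of bios, detaching each one before completing it (bio_endio() may free or recycle the bio), and now stores the error in bi_error per the completion change above. A mock-type sketch of that walk, not the real block layer:

#include <stdio.h>
#include <stdlib.h>

/* Just enough mock bio to show the rbio_orig_end_io() walk. */
struct bio {
	struct bio *bi_next;
	int bi_error;
};

static void bio_endio(struct bio *bio)
{
	printf("bio %p completed, error %d\n", (void *)bio, bio->bi_error);
	free(bio);
}

/* Detach each bio before ending it: bio_endio() may free or reuse it. */
static void end_all(struct bio *cur, int err)
{
	while (cur) {
		struct bio *next = cur->bi_next;

		cur->bi_next = NULL;
		cur->bi_error = err;	/* the error rides in the bio now */
		bio_endio(cur);
		cur = next;
	}
}

int main(void)
{
	struct bio *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct bio *b = calloc(1, sizeof(*b));

		b->bi_next = head;
		head = b;
	}
	end_all(head, -5 /* -EIO */);
	return 0;
}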
When we finally * get here, we've written a full stripe */ -static void raid_write_end_io(struct bio *bio, int err) +static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; + int err = bio->bi_error; if (err) fail_bio_stripe(rbio, bio); @@ -893,7 +905,7 @@ static void raid_write_end_io(struct bio *bio, int err) if (atomic_read(&rbio->error) > rbio->bbio->max_errors) err = -EIO; - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); return; } @@ -1071,7 +1083,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - test_bit(BIO_UPTODATE, &last->bi_flags) && + !last->bi_error && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); if (ret == PAGE_CACHE_SIZE) @@ -1087,7 +1099,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_size = 0; bio->bi_bdev = stripe->dev->bdev; bio->bi_iter.bi_sector = disk_start >> 9; - set_bit(BIO_UPTODATE, &bio->bi_flags); bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); bio_list_add(bio_list, bio); @@ -1312,13 +1323,12 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(WRITE, bio); } return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } /* @@ -1441,11 +1451,11 @@ static void set_bio_pages_uptodate(struct bio *bio) * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid_rmw_end_io(struct bio *bio, int err) +static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1455,7 +1465,6 @@ static void raid_rmw_end_io(struct bio *bio, int err) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - err = 0; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) goto cleanup; @@ -1469,7 +1478,7 @@ static void raid_rmw_end_io(struct bio *bio, int err) cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } static void async_rmw_stripe(struct btrfs_raid_bio *rbio) @@ -1572,14 +1581,13 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } /* the actual write will happen once the reads are done */ return 0; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); return -EIO; finish: @@ -1809,7 +1817,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) faila = rbio->faila; failb = rbio->failb; - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { spin_lock_irq(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); @@ -1834,7 +1843,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1943,7 +1953,8 @@ pstripe: * if we're 
rebuilding a read, we have to use * pages from the bio list */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD && + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1964,7 +1975,9 @@ cleanup_io: else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - rbio_orig_end_io(rbio, err, err == 0); + rbio_orig_end_io(rbio, err); + } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { + rbio_orig_end_io(rbio, err); } else if (err == 0) { rbio->faila = -1; rbio->failb = -1; @@ -1976,7 +1989,7 @@ cleanup_io: else BUG(); } else { - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); } } @@ -1984,7 +1997,7 @@ cleanup_io: * This is called only for stripes we've read from disk to * reconstruct the parity. */ -static void raid_recover_end_io(struct bio *bio, int err) +static void raid_recover_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1992,7 +2005,7 @@ static void raid_recover_end_io(struct bio *bio, int err) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2002,7 +2015,7 @@ static void raid_recover_end_io(struct bio *bio, int err) return; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); else __raid_recover_end_io(rbio); } @@ -2094,15 +2107,15 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } out: return 0; cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD) - rbio_orig_end_io(rbio, -EIO, 0); + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) + rbio_orig_end_io(rbio, -EIO); return -EIO; } @@ -2232,8 +2245,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, return rbio; } -void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, - struct page *page, u64 logical) +/* Used for both parity scrub and missing. */ +void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, + u64 logical) { int stripe_offset; int index; @@ -2277,11 +2291,12 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) * end io function used by finish_rmw. 
When we finally * get here, we've written a full stripe */ -static void raid_write_parity_end_io(struct bio *bio, int err) +static void raid_write_parity_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; + int err = bio->bi_error; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); bio_put(bio); @@ -2294,7 +2309,7 @@ static void raid_write_parity_end_io(struct bio *bio, int err) if (atomic_read(&rbio->error)) err = -EIO; - rbio_orig_end_io(rbio, err, 0); + rbio_orig_end_io(rbio, err); } static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, @@ -2437,7 +2452,7 @@ submit_write: nr_data = bio_list_size(&bio_list); if (!nr_data) { /* Every parity is right */ - rbio_orig_end_io(rbio, 0, 0); + rbio_orig_end_io(rbio, 0); return; } @@ -2450,13 +2465,12 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_parity_end_io; - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(WRITE, bio); } return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2524,7 +2538,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); } /* @@ -2535,11 +2549,11 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio, int err) +static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (err) + if (bio->bi_error) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2632,14 +2646,13 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); submit_bio(READ, bio); } /* the actual write will happen once the reads are done */ return; cleanup: - rbio_orig_end_io(rbio, -EIO, 0); + rbio_orig_end_io(rbio, -EIO); return; finish: @@ -2668,3 +2681,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) async_scrub_parity(rbio); } + +/* The following code is used for dev replace of a missing RAID 5/6 device. 
 */ + +struct btrfs_raid_bio * +raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 length) +{ + struct btrfs_raid_bio *rbio; + + rbio = alloc_rbio(root, bbio, length); + if (IS_ERR(rbio)) + return NULL; + + rbio->operation = BTRFS_RBIO_REBUILD_MISSING; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + + rbio->faila = find_logical_bio_stripe(rbio, bio); + if (rbio->faila == -1) { + BUG(); + kfree(rbio); + return NULL; + } + + return rbio; +} + +static void missing_raid56_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + __raid56_parity_recover(rbio); +} + +static void async_missing_raid56(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + missing_raid56_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); +} + +void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + async_missing_raid56(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 2b5d7977d83b..8b694699d502 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len); +void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, + u64 logical); + struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); -void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, - struct page *page, u64 logical); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); +struct btrfs_raid_bio * +raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 length); +void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 0e7beea92b4c..619f92963e27 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_device *prev_dev; u32 blocksize; u64 length; + int real_stripes; int nzones = 0; int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; @@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, goto error; } - for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + for (nzones = 0; nzones < real_stripes; ++nzones) { struct reada_zone *zone; dev = bbio->stripes[nzones].dev; @@ -567,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, rec = kzalloc(sizeof(*rec), GFP_NOFS); if (!rec) { reada_extent_put(root->fs_info, re); - return -1; + return -ENOMEM; } rec->rc = rc; @@ -916,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, u64 start; u64 generation; int level; + int ret; struct extent_buffer *node; static struct btrfs_key max_key = { .objectid = (u64)-1, @@ -941,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct
btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - if (reada_add_block(rc, start, &max_key, level, generation)) { + ret = reada_add_block(rc, start, &max_key, level, generation); + if (ret) { kfree(rc); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } reada_start_machine(root->fs_info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 88cbb5995667..b4ca5454ef1a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1900,23 +1900,21 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2418,7 +2416,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -2523,8 +2521,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, * counted. return -ENOENT if the block is root of reloc tree. 
*/ static noinline_for_stack -struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node) +struct btrfs_root *select_one_root(struct backref_node *node) { struct backref_node *next; struct btrfs_root *root; @@ -2746,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0, 1); + node->level, 0); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -2912,7 +2909,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, return 0; BUG_ON(node->processed); - root = select_one_root(trans, node); + root = select_one_root(node); if (root == ERR_PTR(-ENOENT)) { update_processed_blocks(rc, node); goto out; @@ -3035,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode, BUG_ON(cluster->start != cluster->boundary[0]); mutex_lock(&inode->i_mutex); - ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start, 0); + ret = btrfs_check_data_free_space(inode, cluster->start, + cluster->end + 1 - cluster->start); if (ret) goto out; @@ -3057,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode, break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->end + - 1 - cluster->start); + btrfs_free_reserved_data_space(inode, cluster->start, + cluster->end + 1 - cluster->start); out: mutex_unlock(&inode->i_mutex); return ret; @@ -3755,8 +3752,7 @@ out: * helper to find next unprocessed extent */ static noinline_for_stack -int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path, +int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, struct btrfs_key *extent_key) { struct btrfs_key key; @@ -3951,7 +3947,7 @@ restart: continue; } - ret = find_next_extent(trans, rc, path, &key); + ret = find_next_extent(rc, path, &key); if (ret < 0) err = ret; if (ret != 0) @@ -3976,6 +3972,10 @@ restart: sizeof(struct btrfs_extent_item_v0)); ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, &path_change); + if (ret < 0) { + err = ret; + break; + } if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; else @@ -4140,7 +4140,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; struct btrfs_root *root; struct btrfs_key key; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + u64 objectid; int err = 0; root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); @@ -4215,14 +4215,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - if (!rc->block_group->ro) { - ret = btrfs_set_block_group_ro(extent_root, rc->block_group); - if (ret) { - err = ret; - goto out; - } - rw = 1; + ret = btrfs_inc_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; } + rw = 1; path = btrfs_alloc_path(); if (!path) { @@ -4294,7 +4292,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: if (err && rw) - btrfs_set_block_group_rw(extent_root, rc->block_group); + btrfs_dec_block_group_ro(extent_root, rc->block_group); iput(rc->data_inode); btrfs_put_block_group(rc->block_group); kfree(rc); @@ -4594,8 +4592,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, * called before creating snapshot. 
it calculates metadata reservation * required for relocating tree blocks in the snapshot */ -void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) { struct btrfs_root *root; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 360a728a639f..7cf8509deda7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "BTRFS: mismatching " + btrfs_warn(eb->fs_info, + "mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " "older kernel. Resetting all " - "new fields.\n"); + "new fields."); } need_reset = 1; } @@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int slot; unsigned long ptr; - int old_len; + u32 old_len; path = btrfs_alloc_path(); if (!path) @@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; } @@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 94db0fa5225a..550de89a8661 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -125,6 +125,7 @@ struct scrub_block { /* It is for the data with checksum */ unsigned int data_corrected:1; }; + struct btrfs_work work; }; /* Used for the chunks with parity stripe such as RAID5/6 */ @@ -278,7 +279,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, u64 physical_for_dev_replace); -static void scrub_bio_end_io(struct bio *bio, int err); +static void scrub_bio_end_io(struct bio *bio); static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, @@ -295,7 +296,7 @@ static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx); static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); -static void scrub_wr_bio_end_io(struct bio *bio, int err); +static void scrub_wr_bio_end_io(struct bio *bio); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); @@ -332,11 +333,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) } } -static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +static void scrub_pause_on(struct btrfs_fs_info *fs_info) { atomic_inc(&fs_info->scrubs_paused); wake_up(&fs_info->scrub_pause_wait); +} +static void scrub_pause_off(struct btrfs_fs_info *fs_info) +{ mutex_lock(&fs_info->scrub_lock); __scrub_blocked_if_needed(fs_info);
atomic_dec(&fs_info->scrubs_paused); @@ -345,6 +349,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) wake_up(&fs_info->scrub_pause_wait); } +static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) +{ + scrub_pause_on(fs_info); + scrub_pause_off(fs_info); +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -454,27 +464,14 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) struct scrub_ctx *sctx; int i; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; - int pages_per_rd_bio; int ret; - /* - * the setting of pages_per_rd_bio is correct for scrub but might - * be wrong for the dev_replace code where we might read from - * different devices in the initial huge bios. However, that - * code is able to correctly handle the case when adding a page - * to a bio fails. - */ - if (dev->bdev) - pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO, - bio_get_nr_vecs(dev->bdev)); - else - pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_rd_bio = pages_per_rd_bio; + sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; sctx->dev_root = dev->dev_root; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { @@ -583,9 +580,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " - "length %llu, links %u (path: %s)\n", swarn->errstr, + "length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, @@ -595,9 +592,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " - "resolving failed with ret=%d\n", swarn->errstr, + "resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); @@ -652,10 +649,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); - printk_in_rcu(KERN_WARNING - "BTRFS: %s at logical %llu on dev %s, " + btrfs_warn_in_rcu(fs_info, + "%s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, + "%llu", errstr, swarn.logical, rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? "node" : "leaf", @@ -853,8 +850,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. 
num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + "unable to fixup (nodatasum) error at logical %llu on dev %s", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -1233,8 +1230,8 @@ corrected_error: sctx->stat.corrected_errors++; sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: fixed up error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } } else { @@ -1242,8 +1239,8 @@ did_not_correct_error: spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } @@ -1429,11 +1426,11 @@ struct scrub_bio_ret { int error; }; -static void scrub_bio_wait_endio(struct bio *bio, int error) +static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = error; + ret->error = bio->bi_error; complete(&ret->event); } @@ -1629,9 +1626,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING "BTRFS: " + btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) " - "is unexpected!\n"); + "is unexpected"); return -EIO; } @@ -1790,12 +1787,12 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(WRITE, sbio->bio); } -static void scrub_wr_bio_end_io(struct bio *bio, int err) +static void scrub_wr_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; - sbio->err = err; + sbio->err = bio->bi_error; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -2087,21 +2084,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - - if (!sbio->bio->bi_bdev) { - /* - * this case should not happen. If btrfs_map_block() is - * wrong, it could happen for dev-replace operations on - * missing devices when no mirrors are available, but in - * this case it should already fail the mount. - * This case is handled correctly (but _very_ slowly). 
- */ - printk_ratelimited(KERN_WARNING - "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); - bio_endio(sbio->bio, -EIO); - } else { - btrfsic_submit_bio(READ, sbio->bio); - } + btrfsic_submit_bio(READ, sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2178,6 +2161,134 @@ again: return 0; } +static void scrub_missing_raid56_end_io(struct bio *bio) +{ + struct scrub_block *sblock = bio->bi_private; + struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info; + + if (bio->bi_error) + sblock->no_io_error_seen = 0; + + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); +} + +static void scrub_missing_raid56_worker(struct btrfs_work *work) +{ + struct scrub_block *sblock = container_of(work, struct scrub_block, work); + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + unsigned int is_metadata; + unsigned int have_csum; + u8 *csum; + u64 generation; + u64 logical; + struct btrfs_device *dev; + + is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA); + have_csum = sblock->pagev[0]->have_csum; + csum = sblock->pagev[0]->csum; + generation = sblock->pagev[0]->generation; + logical = sblock->pagev[0]->logical; + dev = sblock->pagev[0]->dev; + + if (sblock->no_io_error_seen) { + scrub_recheck_block_checksum(fs_info, sblock, is_metadata, + have_csum, csum, generation, + sctx->csum_size); + } + + if (!sblock->no_io_error_seen) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_err_rl_in_rcu(fs_info, + "IO error rebuilding logical %llu for dev %s", + logical, rcu_str_deref(dev->name)); + } else if (sblock->header_error || sblock->checksum_error) { + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + btrfs_err_rl_in_rcu(fs_info, + "failed to rebuild valid logical %llu for dev %s", + logical, rcu_str_deref(dev->name)); + } else { + scrub_write_block_to_dev_replace(sblock); + } + + scrub_block_put(sblock); + + if (sctx->is_dev_replace && + atomic_read(&sctx->wr_ctx.flush_all_writes)) { + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + } + + scrub_pending_bio_dec(sctx); +} + +static void scrub_missing_raid56_pages(struct scrub_block *sblock) +{ + struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = sblock->page_count * PAGE_SIZE; + u64 logical = sblock->pagev[0]->logical; + struct btrfs_bio *bbio; + struct bio *bio; + struct btrfs_raid_bio *rbio; + int ret; + int i; + + ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length, + &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) + goto bbio_out; + + if (WARN_ON(!sctx->is_dev_replace || + !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { + /* + * We shouldn't be scrubbing a missing device. Even for dev + * replace, we should only get here for RAID 5/6. We either + * managed to mount something with no mirrors remaining or + * there's a bug in scrub_remap_extent()/btrfs_map_block(). 
+ */ + goto bbio_out; + } + + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + goto bbio_out; + + bio->bi_iter.bi_sector = logical >> 9; + bio->bi_private = sblock; + bio->bi_end_io = scrub_missing_raid56_end_io; + + rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length); + if (!rbio) + goto rbio_out; + + for (i = 0; i < sblock->page_count; i++) { + struct scrub_page *spage = sblock->pagev[i]; + + raid56_add_scrub_pages(rbio, spage->page, spage->logical); + } + + btrfs_init_work(&sblock->work, btrfs_scrub_helper, + scrub_missing_raid56_worker, NULL, NULL); + scrub_block_get(sblock); + scrub_pending_bio_inc(sctx); + raid56_submit_missing_rbio(rbio); + return; + +rbio_out: + bio_put(bio); +bbio_out: + btrfs_put_bbio(bbio); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); +} + static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, int force, @@ -2241,31 +2352,39 @@ leave_nomem: } WARN_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; - int ret; + if (dev->missing) { + /* + * This case should only be hit for RAID 5/6 device replace. See + * the comment in scrub_missing_raid56_pages() for details. + */ + scrub_missing_raid56_pages(sblock); + } else { + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); - if (ret) { - scrub_block_put(sblock); - return ret; + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } } - } - if (force) - scrub_submit(sctx); + if (force) + scrub_submit(sctx); + } /* last one frees, either here or in bio completion for last page */ scrub_block_put(sblock); return 0; } -static void scrub_bio_end_io(struct bio *bio, int err) +static void scrub_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; - sbio->err = err; + sbio->err = bio->bi_error; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2564,6 +2683,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; + if (dev->missing) { + scrub_parity_mark_sectors_error(sparity, logical, len); + return 0; + } + if (flags & BTRFS_EXTENT_FLAG_DATA) { blocksize = sctx->sectorsize; } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { @@ -2672,11 +2796,11 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work) scrub_pending_bio_dec(sctx); } -static void scrub_parity_bio_endio(struct bio *bio, int error) +static void scrub_parity_bio_endio(struct bio *bio) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; - if (error) + if (bio->bi_error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2702,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) sparity->nsectors)) goto out; - length = sparity->logic_end - sparity->logic_start + 1; + length = sparity->logic_end - sparity->logic_start; ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, sparity->logic_start, &length, &bbio, 0, 1); @@ -2725,8 +2849,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto rbio_out; list_for_each_entry(spage, &sparity->spages, list) - 
raid56_parity_add_scrub_pages(rbio, spage->page, - spage->logical); + raid56_add_scrub_pages(rbio, spage->page, spage->logical); scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); @@ -2774,6 +2897,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; + struct btrfs_bio *bbio = NULL; u64 flags; int ret; int slot; @@ -2783,6 +2907,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 extent_logical; u64 extent_physical; u64 extent_len; + u64 mapped_length; struct btrfs_device *extent_dev; struct scrub_parity *sparity; int nsectors; @@ -2856,6 +2981,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, } btrfs_item_key_to_cpu(l, &key, slot); + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) bytes = root->nodesize; else @@ -2864,11 +2993,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, if (key.objectid + bytes <= logic_start) goto next; - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.objectid > logic_end) { + if (key.objectid >= logic_end) { stop_loop = 1; break; } @@ -2881,11 +3006,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, flags = btrfs_extent_flags(l, extent); generation = btrfs_extent_generation(l, extent); - if (key.objectid < logic_start && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - key.objectid, logic_start); + if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + (key.objectid < logic_start || + key.objectid + bytes > + logic_start + map->stripe_len)) { + btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", + key.objectid, logic_start); goto next; } again: @@ -2905,10 +3031,21 @@ again: scrub_parity_mark_sectors_data(sparity, extent_logical, extent_len); - scrub_remap_extent(fs_info, extent_logical, - extent_len, &extent_physical, - &extent_dev, - &extent_mirror_num); + mapped_length = extent_len; + ret = btrfs_map_block(fs_info, READ, extent_logical, + &mapped_length, &bbio, 0); + if (!ret) { + if (!bbio || mapped_length < extent_len) + ret = -EIO; + } + if (ret) { + btrfs_put_bbio(bbio); + goto out; + } + extent_physical = bbio->stripes[0].physical; + extent_mirror_num = bbio->mirror_num; + extent_dev = bbio->stripes[0].dev; + btrfs_put_bbio(bbio); ret = btrfs_lookup_csums_range(csum_root, extent_logical, @@ -2923,10 +3060,12 @@ again: extent_dev, flags, generation, extent_mirror_num); + + scrub_free_csums(sctx); + if (ret) goto out; - scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { logic_start += map->stripe_len; @@ -2955,7 +3094,7 @@ next: out: if (ret < 0) scrub_parity_mark_sectors_error(sparity, logic_start, - logic_end - logic_start + 1); + logic_end - logic_start); scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_ctx.wr_lock); @@ -3104,22 +3243,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ ret = 0; while (physical < physical_end) { - /* for raid56, we skip parity stripe */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, num, - map, &logical, &stripe_logical); - logical += base; - if (ret) { - stripe_logical += base; - stripe_end = stripe_logical + increment - 1; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - ppath, stripe_logical, - stripe_end); - if (ret) - goto out; - goto skip; - } - } /* * canceled? */ @@ -3144,6 +3267,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); } + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = get_raid56_logic_offset(physical, num, map, + &logical, + &stripe_logical); + logical += base; + if (ret) { + /* it is parity strip */ + stripe_logical += base; + stripe_end = stripe_logical + increment; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + ppath, stripe_logical, + stripe_end); + if (ret) + goto out; + goto skip; + } + } + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else @@ -3188,6 +3329,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } btrfs_item_key_to_cpu(l, &key, slot); + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) bytes = root->nodesize; else @@ -3196,10 +3341,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (key.objectid + bytes <= logical) goto next; - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - if (key.objectid >= logical + map->stripe_len) { /* out of this device extent */ if (key.objectid >= logic_end) @@ -3212,8 +3353,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, flags = btrfs_extent_flags(l, extent); generation = btrfs_extent_generation(l, extent); - if (key.objectid < logical && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + (key.objectid < logical || + key.objectid + bytes > + logical + map->stripe_len)) { btrfs_err(fs_info, "scrub: tree block %llu spanning " "stripes, ignored. 
logical=%llu", @@ -3247,9 +3390,11 @@ again: &extent_dev, &extent_mirror_num); - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sctx->csum_list, 1); + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + + extent_len - 1, + &sctx->csum_list, 1); if (ret) goto out; @@ -3257,10 +3402,12 @@ again: extent_physical, extent_dev, flags, generation, extent_mirror_num, extent_logical - logical + physical); + + scrub_free_csums(sctx); + if (ret) goto out; - scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -3278,7 +3425,7 @@ loop: if (ret && physical < physical_end) { stripe_logical += base; stripe_end = stripe_logical + - increment - 1; + increment; ret = scrub_raid56_parity(sctx, map, scrub_dev, ppath, stripe_logical, @@ -3333,7 +3480,6 @@ out: static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, u64 dev_offset, int is_dev_replace) { @@ -3384,10 +3530,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_root *root = sctx->dev_root; struct btrfs_fs_info *fs_info = root->fs_info; u64 length; - u64 chunk_tree; - u64 chunk_objectid; u64 chunk_offset; - int ret; + int ret = 0; int slot; struct extent_buffer *l; struct btrfs_key key; @@ -3415,8 +3559,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); - if (ret) + if (ret < 0) + break; + if (ret > 0) { + ret = 0; break; + } + } else { + ret = 0; } } @@ -3443,8 +3593,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (found_key.offset + length <= start) goto skip; - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); - chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); /* @@ -3458,12 +3606,27 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + /* + * we need call btrfs_inc_block_group_ro() with scrubs_paused, + * to avoid deadlock caused by: + * btrfs_inc_block_group_ro() + * -> btrfs_wait_for_commit() + * -> btrfs_commit_transaction() + * -> btrfs_scrub_pause() + */ + scrub_pause_on(fs_info); + ret = btrfs_inc_block_group_ro(root, cache); + scrub_pause_off(fs_info); + if (ret) { + btrfs_put_block_group(cache); + break; + } + dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, - chunk_offset, length, found_key.offset, - is_dev_replace); + ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, + found_key.offset, is_dev_replace); /* * flush, submit all pending read and write bios, afterwards @@ -3483,8 +3646,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_inc(&fs_info->scrubs_paused); - wake_up(&fs_info->scrub_pause_wait); + + scrub_pause_on(fs_info); /* * must be called before we decrease @scrub_paused. 
@@ -3495,11 +3658,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, atomic_read(&sctx->workers_pending) == 0); atomic_set(&sctx->wr_ctx.flush_all_writes, 0); - mutex_lock(&fs_info->scrub_lock); - __scrub_blocked_if_needed(fs_info); - atomic_dec(&fs_info->scrubs_paused); - mutex_unlock(&fs_info->scrub_lock); - wake_up(&fs_info->scrub_pause_wait); + scrub_pause_off(fs_info); + + btrfs_dec_block_group_ro(root, cache); btrfs_put_block_group(cache); if (ret) @@ -3523,11 +3684,7 @@ skip: btrfs_free_path(path); - /* - * ret can still be 1 from search_slot or next_leaf, - * that's not an error - */ - return ret < 0 ? ret : 0; + return ret; } static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, @@ -3896,8 +4053,7 @@ static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, return 0; WARN_ON(!dev->bdev); - wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, - bio_get_nr_vecs(dev->bdev)); + wr_ctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; wr_ctx->tgtdev = dev; atomic_set(&wr_ctx->flush_all_writes, 0); return 0; @@ -4219,8 +4375,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, if (!dev) return -EIO; if (!dev->bdev) { - printk_ratelimited(KERN_WARNING - "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + btrfs_warn_rl(dev->dev_root->fs_info, + "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index aa72bfd28f7d..355a458cba1a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { - if (compressed != BTRFS_COMPRESS_NONE) { - /* - * Offsets given by iterate_extent_inodes() are relative - * to the start of the extent, we need to add logical - * offset from the file extent item. - * (See why at backref.c:check_extent_in_eb()) - */ - cur_clone_root->offset += btrfs_file_extent_offset(eb, - fi); - } *found = cur_clone_root; ret = 0; } else { @@ -1920,10 +1910,12 @@ static int did_overwrite_ref(struct send_ctx *sctx, /* * We know that it is or will be overwritten. Check this now. * The current inode being processed might have been the one that caused - * inode 'ino' to be orphanized, therefore ow_inode can actually be the - * same as sctx->send_progress. + * inode 'ino' to be orphanized, therefore check if ow_inode matches + * the current inode being processed. 
*/ - if (ow_inode <= sctx->send_progress) + if ((ow_inode < sctx->send_progress) || + (ino != sctx->cur_ino && ow_inode == sctx->cur_ino && + gen == sctx->cur_inode_gen)) ret = 1; else ret = 0; @@ -2351,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx) } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); - TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, - sctx->send_root->root_item.uuid); + + if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { @@ -2562,7 +2560,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; } else { - printk(KERN_WARNING "btrfs: unexpected inode type %o", + btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; goto out; @@ -4685,6 +4683,171 @@ tlv_put_failure: return ret; } +static int send_extent_data(struct send_ctx *sctx, + const u64 offset, + const u64 len) +{ + u64 sent = 0; + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, len); + + while (sent < len) { + u64 size = len - sent; + int ret; + + if (size > BTRFS_SEND_READ_SIZE) + size = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, offset + sent, size); + if (ret < 0) + return ret; + if (!ret) + break; + sent += ret; + } + return 0; +} + +static int clone_range(struct send_ctx *sctx, + struct clone_root *clone_root, + const u64 disk_byte, + u64 data_offset, + u64 offset, + u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + /* + * We can't send a clone operation for the entire range if we find + * extent items in the respective range in the source file that + * refer to different extents or if we find holes. + * So check for that and do a mix of clone and regular write/copy + * operations if needed. + * + * Example: + * + * mkfs.btrfs -f /dev/sda + * mount /dev/sda /mnt + * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo + * cp --reflink=always /mnt/foo /mnt/bar + * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo + * btrfs subvolume snapshot -r /mnt /mnt/snap + * + * If when we send the snapshot and we are processing file bar (which + * has a higher inode number than foo) we blindly send a clone operation + * for the [0, 100K[ range from foo to bar, the receiver ends up getting + * a file bar that matches the content of file foo - iow, doesn't match + * the content from bar in the original filesystem. 
+ */ + key.objectid = clone_root->ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_root->offset; + ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == clone_root->ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *ei; + u8 type; + u64 ext_len; + u64 clone_len; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(clone_root->root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* + * We might have an implicit trailing hole (NO_HOLES feature + * enabled). We deal with it after leaving this loop. + */ + if (key.objectid != clone_root->ino || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); + ext_len = PAGE_CACHE_ALIGN(ext_len); + } else { + ext_len = btrfs_file_extent_num_bytes(leaf, ei); + } + + if (key.offset + ext_len <= clone_root->offset) + goto next; + + if (key.offset > clone_root->offset) { + /* Implicit hole, NO_HOLES feature enabled. */ + u64 hole_len = key.offset - clone_root->offset; + + if (hole_len > len) + hole_len = len; + ret = send_extent_data(sctx, offset, hole_len); + if (ret < 0) + goto out; + + len -= hole_len; + if (len == 0) + break; + offset += hole_len; + clone_root->offset += hole_len; + data_offset += hole_len; + } + + if (key.offset >= clone_root->offset + len) + break; + + clone_len = min_t(u64, ext_len, len); + + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && + btrfs_file_extent_offset(leaf, ei) == data_offset) + ret = send_clone(sctx, offset, clone_len, clone_root); + else + ret = send_extent_data(sctx, offset, clone_len); + + if (ret < 0) + goto out; + + len -= clone_len; + if (len == 0) + break; + offset += clone_len; + clone_root->offset += clone_len; + data_offset += clone_len; +next: + path->slots[0]++; + } + + if (len > 0) + ret = send_extent_data(sctx, offset, len); + else + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -4693,9 +4856,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 pos = 0; u64 len; - u32 l; u8 type; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; @@ -4723,22 +4884,15 @@ static int send_write_or_clone(struct send_ctx *sctx, } if (clone_root && IS_ALIGNED(offset + len, bs)) { - ret = send_clone(sctx, offset, len, clone_root); - } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { - ret = send_update_extent(sctx, offset, len); + u64 disk_byte; + u64 data_offset; + + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, clone_root, disk_byte, data_offset, + offset, len); } else { - while (pos < len) { - l = len - pos; - if (l > BTRFS_SEND_READ_SIZE) - l = BTRFS_SEND_READ_SIZE; - ret = send_write(sctx, pos + offset, l); - if (ret < 0) - goto out; - if (!ret) - break; - pos += ret; - } - ret = 0; + 
ret = send_extent_data(sctx, offset, len); } out: return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6bad63379a4c..24154e422945 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -static const char *btrfs_decode_error(int errno) +const char *btrfs_decode_error(int errno) { char *errstr = "unknown"; @@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } } -#ifdef CONFIG_PRINTK /* * __btrfs_std_error decodes expected errors from the caller and * invokes the appropriate error response. */ @@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK const char *errstr; +#endif /* * Special case: if the error is EROFS, and we're already @@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; +#ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); if (fmt) { struct va_format vaf; @@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } +#endif /* Don't go through full error handling during mount */ save_error_info(fs_info); @@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, btrfs_handle_error(fs_info); } +#ifdef CONFIG_PRINTK static const char * const logtypes[] = { "emergency", "alert", @@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_end(args); } - -#else - -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here.
- */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; - - /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); - btrfs_handle_error(fs_info); - } -} #endif /* @@ -320,6 +303,9 @@ enum { Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, Opt_datasum, Opt_treelog, Opt_noinode_cache, +#ifdef CONFIG_BTRFS_DEBUG + Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, +#endif Opt_err, }; @@ -372,6 +358,11 @@ static match_table_t tokens = { {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_commit_interval, "commit=%d"}, +#ifdef CONFIG_BTRFS_DEBUG + {Opt_fragment_data, "fragment=data"}, + {Opt_fragment_metadata, "fragment=metadata"}, + {Opt_fragment_all, "fragment=all"}, +#endif {Opt_err, NULL}, }; @@ -738,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment_all: + btrfs_info(root->fs_info, "fragmenting all space"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_metadata: + btrfs_info(root->fs_info, "fragmenting metadata"); + btrfs_set_opt(info->mount_opt, + FRAGMENT_METADATA); + break; + case Opt_fragment_data: + btrfs_info(root->fs_info, "fragmenting data"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + break; +#endif case Opt_err: btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -1033,6 +1040,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif sb->s_flags |= MS_I_VERSION; + sb->s_iflags |= SB_I_CGROUPWB; err = open_ctree(sb, fs_devices, (char *)data); if (err) { printk(KERN_ERR "BTRFS: open_ctree failed\n"); @@ -1188,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_test_opt(root, FRAGMENT_DATA)) + seq_puts(seq, ",fragment=data"); + if (btrfs_test_opt(root, FRAGMENT_METADATA)) + seq_puts(seq, ",fragment=metadata"); +#endif seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); @@ -1650,6 +1664,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags |= MS_RDONLY; + /* + * Setting MS_RDONLY will put the cleaner thread to + * sleep at the next loop if it's already active. + * If it's already asleep, we'll leave unused block + * groups on disk until we're mounted read-write again + * unless we clean them up here. 
+ */ + btrfs_delete_unused_bgs(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 603b0cc2b9bb..e0ac85949067 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = { NULL, }; -static void btrfs_release_super_kobj(struct kobject *kobj) +static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); - memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_super_kobj, + .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_devices, super_kobj); + return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) @@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } @@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) fs_devs->device_dir_kobj = NULL; } - if (fs_devs->super_kobj.state_initialized) { - kobject_del(&fs_devs->super_kobj); - kobject_put(&fs_devs->super_kobj); + if (fs_devs->fsid_kobj.state_initialized) { + kobject_del(&fs_devs->fsid_kobj); + kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } @@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { btrfs_reset_fs_info_ptr(fs_info); @@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) kobject_put(fs_info->space_info_kobj); } addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); - btrfs_kobj_rm_device(fs_info->fs_devices, NULL); + sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -637,7 +637,7 @@ static void init_feature_attrs(void) /* when one_device is NULL, it removes all device links */ -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { if (!fs_devs->device_dir_kobj) fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->super_kobj); + &fs_devs->fsid_kobj); if (!fs_devs->device_dir_kobj) return -ENOMEM; @@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) 
return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, int error; init_completion(&fs_devs->kobj_unregister); - fs_devs->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->super_kobj, + fs_devs->fsid_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); return error; } -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; - struct kobject *super_kobj = &fs_devs->super_kobj; + struct kobject *fsid_kobj = &fs_devs->fsid_kobj; btrfs_set_fs_info_ptr(fs_info); - error = btrfs_kobj_add_device(fs_devs, NULL); + error = btrfs_sysfs_add_device_link(fs_devs, NULL); if (error) return error; - error = sysfs_create_files(super_kobj, btrfs_attrs); + error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_kobj_rm_device(fs_devs, NULL); + btrfs_sysfs_rm_device_link(fs_devs, NULL); return error; } - error = sysfs_create_group(super_kobj, + error = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (error) goto failure; @@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - super_kobj); + fsid_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; @@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) return 0; failure: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6392527bcc15..9c09522125a6 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 2299bfde39ee..c8c3d70c31ff 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../disk-io.h" #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) @@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void) kfree(cache); return NULL; } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); + kfree(cache); + return NULL; + } cache->key.objectid = 0; cache->key.offset = 1024 * 1024 * 1024; @@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; - int ret; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; 
test_msg("Running btrfs free space cache tests\n"); @@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void) return 0; } + root = btrfs_alloc_dummy_root(); + if (!root) + goto out; + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) + goto out; + + root->fs_info->extent_root = root; + cache->fs_info = root->fs_info; + ret = test_extents(cache); if (ret) goto out; @@ -904,6 +923,7 @@ out: __btrfs_remove_free_space_cache(cache->free_space_ctl); kfree(cache->free_space_ctl); kfree(cache); + btrfs_free_dummy_root(root); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f5021fcb154e..418c6a2ad7d8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) static void clear_btree_io_tree(struct extent_io_tree *tree) { spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once clear_btree_io_tree is + * called. + */ + smp_mb(); while (!RB_EMPTY_ROOT(&tree->state)) { struct rb_node *node; struct extent_state *state; @@ -117,6 +123,18 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, btrfs_unpin_free_ino(root); clear_btree_io_tree(&root->dirty_log_pages); } + + /* We can free old roots now. */ + spin_lock(&trans->dropped_roots_lock); + while (!list_empty(&trans->dropped_roots)) { + root = list_first_entry(&trans->dropped_roots, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + spin_unlock(&trans->dropped_roots_lock); + btrfs_drop_and_free_fs_root(fs_info, root); + spin_lock(&trans->dropped_roots_lock); + } + spin_unlock(&trans->dropped_roots_lock); up_write(&fs_info->commit_root_sem); } @@ -214,25 +232,22 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); + init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. 
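* (In other words, the transaction structure is freed only after two * btrfs_put_transaction() calls: one when this handle ends and one * from the commit path.)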
*/ atomic_set(&cur_trans->use_count, 2); - cur_trans->have_free_bgs = 0; + atomic_set(&cur_trans->pending_ordered, 0); + cur_trans->flags = 0; cur_trans->start_time = get_seconds(); - cur_trans->dirty_bg_run = 0; + + memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.pending_csums = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -252,12 +267,15 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); - INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); + INIT_LIST_HEAD(&cur_trans->dropped_roots); mutex_init(&cur_trans->cache_write_mutex); cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); + INIT_LIST_HEAD(&cur_trans->deleted_bgs); + spin_lock_init(&cur_trans->deleted_bgs_lock); + spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -334,6 +352,24 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, } +void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + + /* Add ourselves to the transaction dropped list */ + spin_lock(&cur_trans->dropped_roots_lock); + list_add_tail(&root->root_list, &cur_trans->dropped_roots); + spin_unlock(&cur_trans->dropped_roots_lock); + + /* Make sure we don't try to update the root at commit time */ + spin_lock(&root->fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&root->fs_info->fs_roots_radix_lock); +} + int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -413,8 +449,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, - enum btrfs_reserve_flush_enum flush) +start_transaction(struct btrfs_root *root, unsigned int num_items, + unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -444,13 +480,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, * the appropriate flushing if need be. 
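* (Rough sizing illustration, assuming the 4.x helpers: the qgroup * side below reserves num_items * nodesize, while * btrfs_calc_trans_metadata_size() reserves 2 * BTRFS_MAX_LEVEL tree * blocks per item, e.g. 2 * 8 * 16K = 256K per item at a 16K nodesize.)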
*/ if (num_items > 0 && root != root->fs_info->chunk_root) { - if (root->fs_info->quota_enabled && - is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->nodesize; - ret = btrfs_qgroup_reserve(root, qgroup_reserved); - if (ret) - return ERR_PTR(ret); - } + qgroup_reserved = num_items * root->nodesize; + ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); num_bytes = btrfs_calc_trans_metadata_size(root, num_items); /* @@ -468,7 +501,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, goto reserve_fail; } again: - h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) { ret = -ENOMEM; goto alloc_fail; @@ -509,25 +542,13 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->blocks_used = 0; - h->bytes_reserved = 0; - h->chunk_bytes_reserved = 0; h->root = root; - h->delayed_ref_updates = 0; h->use_count = 1; - h->adding_csums = 0; - h->block_rsv = NULL; - h->orig_rsv = NULL; - h->aborted = 0; - h->qgroup_reserved = 0; - h->delayed_ref_elem.seq = 0; + h->type = type; - h->allocating_chunk = false; - h->reloc_reserved = false; - h->sync = false; + h->can_flush_pending_bgs = true; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); - INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -544,7 +565,6 @@ again: h->bytes_reserved = num_bytes; h->reloc_reserved = reloc_reserved; } - h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -562,20 +582,20 @@ alloc_fail: btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, num_bytes); reserve_fail: - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); + btrfs_qgroup_free_meta(root, qgroup_reserved); return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items) + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL); } struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items) + struct btrfs_root *root, + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_LIMIT); @@ -759,12 +779,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - if (!list_empty(&trans->ordered)) { - spin_lock(&info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); - spin_unlock(&info->trans_lock); - } - trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -780,15 +794,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - if (trans->qgroup_reserved) { - /* - * the same root has to be passed here between start_transaction - * and end_transaction. Subvolume quota depends on this. - */ - btrfs_qgroup_free(trans->root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -821,6 +826,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); + /* + * Make sure counter is updated before we wake up waiters. 
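+ * (The smp_mb() below pairs with the barrier implied by + * prepare_to_wait() on the waiter side: either this thread sees the + * waiter already queued, or the waiter sees the updated counter, so + * the wakeup cannot be lost.)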
+ */ smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); @@ -1203,6 +1211,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; + btrfs_qgroup_free_meta_all(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1301,7 +1310,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ btrfs_set_skip_qgroup(trans, objectid); - btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { pending->error = btrfs_block_rsv_add(root, @@ -1638,9 +1647,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1679,9 +1686,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); @@ -1764,25 +1769,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) } static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) { - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &ordered->flags)); - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); } int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -1811,10 +1801,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } cur_trans = trans->transaction; @@ -1834,7 +1820,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - if (!cur_trans->dirty_bg_run) { + if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; /* this mutex is also taken before trying to set @@ -1843,18 +1829,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * after a extents from that block group have been * allocated for cache files. btrfs_set_block_group_ro * will wait for the transaction to commit if it - * finds dirty_bg_run = 1 + * finds BTRFS_TRANS_DIRTY_BG_RUN set. * - * The dirty_bg_run flag is also used to make sure only - * one process starts all the block group IO. It wouldn't + * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure + * only one process starts all the block group IO. It wouldn't * hurt to have more than one go through, but there's no * real advantage to it either. 
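* (With the flags word, test_and_set_bit() below also makes the * check-and-set itself atomic, so at most one caller per transaction * sees run_it == 1.)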
*/ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (!cur_trans->dirty_bg_run) { + if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, + &cur_trans->flags)) run_it = 1; - cur_trans->dirty_bg_run = 1; - } mutex_unlock(&root->fs_info->ro_block_group_mutex); if (run_it) @@ -1866,7 +1851,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1893,8 +1877,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->trans_lock); wait_for_commit(root, prev_trans); + ret = prev_trans->aborted; btrfs_put_transaction(prev_trans); + if (ret) + goto cleanup_transaction; } else { spin_unlock(&root->fs_info->trans_lock); } @@ -1922,7 +1909,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); - btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_wait_pending_ordered(cur_trans); btrfs_scrub_pause(root); /* @@ -2102,7 +2089,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; @@ -2122,7 +2109,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - if (cur_trans->have_free_bgs) + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(root->fs_info); root->fs_info->last_trans_committed = cur_trans->transid; @@ -2164,10 +2151,6 @@ cleanup_transaction: btrfs_trans_release_metadata(trans, root); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index eb09c2067fa8..b05b2f64d913 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -32,6 +32,10 @@ enum btrfs_trans_state { TRANS_STATE_MAX = 6, }; +#define BTRFS_TRANS_HAVE_FREE_BGS 0 +#define BTRFS_TRANS_DIRTY_BG_RUN 1 +#define BTRFS_TRANS_CACHE_ENOSPC 2 + struct btrfs_transaction { u64 transid; /* @@ -46,11 +50,9 @@ struct btrfs_transaction { */ atomic_t num_writers; atomic_t use_count; + atomic_t pending_ordered; - /* - * true if there is free bgs operations in this transaction - */ - int have_free_bgs; + unsigned long flags; /* Be protected by fs_info->trans_lock when we want to change it. 
*/ enum btrfs_trans_state state; @@ -59,12 +61,13 @@ struct btrfs_transaction { unsigned long start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; + wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; - struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; struct list_head io_bgs; + struct list_head dropped_roots; u64 num_dirty_bgs; /* @@ -74,9 +77,11 @@ struct btrfs_transaction { */ struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; + struct list_head deleted_bgs; + spinlock_t deleted_bgs_lock; + spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; - int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -103,7 +108,6 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; @@ -114,6 +118,7 @@ struct btrfs_trans_handle { short aborted; short adding_csums; bool allocating_chunk; + bool can_flush_pending_bgs; bool reloc_reserved; bool sync; unsigned int type; @@ -124,7 +129,6 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; - struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; @@ -180,9 +184,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items); + unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items); + struct btrfs_root *root, + unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); @@ -214,5 +219,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); - +void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a4b9c8b2d35a..f31db4325339 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -115,8 +115,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ret = -EAGAIN; } out: - if (path) - btrfs_free_path(path); + btrfs_free_path(path); if (ret == -EAGAIN) { if (root->defrag_max.objectid > root->defrag_progress.objectid) goto done; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9c45431e69ab..323e12cc9d2f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - int index; - int ret; + int ret = 0; mutex_lock(&root->log_mutex); + if (root->log_root) { if (btrfs_need_log_full_commit(root->fs_info, trans)) { ret = -EAGAIN; goto out; } + if (!root->log_start_pid) { - root->log_start_pid = current->pid; clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } else if (root->log_start_pid != current->pid) { 
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } + } else { + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) + ret = btrfs_init_log_root_tree(trans, root->fs_info); + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; - atomic_inc(&root->log_batch); - atomic_inc(&root->log_writers); - if (ctx) { - index = root->log_transid % 2; - list_add_tail(&ctx->list, &root->log_ctxs[index]); - ctx->log_transid = root->log_transid; - } - mutex_unlock(&root->log_mutex); - return 0; - } - - ret = 0; - mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, root->fs_info); - mutex_unlock(&root->fs_info->tree_log_mutex); - if (ret) - goto out; - - if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) goto out; + + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; } - clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); - root->log_start_pid = current->pid; + atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); if (ctx) { - index = root->log_transid % 2; + int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; } + out: mutex_unlock(&root->log_mutex); return ret; @@ -238,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } @@ -700,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); if (ret) goto out; } else { @@ -731,12 +724,66 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, &ordered_sums, 0); if (ret) goto out; + /* + * Now delete all existing csums in the csum root that + * cover our range. We do this because we can have an + * extent that is completely referenced by one file + * extent item and partially referenced by another + * file extent item (like after using the clone or + * extent_same ioctls). In this case if we end up doing + * the replay of the one that partially references the + * extent first, and we do not do the csum deletion + * below, we can get 2 csum items in the csum tree that + * overlap each other. For example, imagine our log has + * the two following file extent items: + * + * key (257 EXTENT_DATA 409600) + * extent data disk byte 12845056 nr 102400 + * extent data offset 20480 nr 20480 ram 102400 + * + * key (257 EXTENT_DATA 819200) + * extent data disk byte 12845056 nr 102400 + * extent data offset 0 nr 102400 ram 102400 + * + * Where the second one fully references the 100K extent + * that starts at disk byte 12845056, and the log tree + * has a single csum item that covers the entire range + * of the extent: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * + * After the first file extent item is replayed, the + * csum tree gets the following csum item: + * + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which covers the 20K sub-range starting at offset 20K + * of our extent. 
Now when we replay the second file + * extent item, if we do not delete existing csum items + * that cover any of its blocks, we end up getting two + * csum items in our csum tree that overlap each other: + * + * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 + * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 + * + * Which is a problem, because after this anyone trying + * to look up the checksum of any block of our + * extent starting at an offset of 40K or higher will + * end up looking at the second csum item only, which + * does not contain the checksum for any block starting + * at offset 40K or higher of our extent. + */ while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); if (!ret) + ret = btrfs_del_csums(trans, + root->fs_info->csum_root, + sums->bytenr, + sums->len); + if (!ret) ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums); @@ -1549,9 +1596,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, */ static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, u64 index, - char *name, int name_len, u8 type, + char *name, int name_len, struct btrfs_key *location) { struct inode *inode; @@ -1613,6 +1659,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root, * not exist in the FS, it is skipped. fsyncs on directories * do not force down inodes inside that directory, just changes to the * names or unlinks in a directory. + * + * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a + * non-existing inode) and 1 if the name was replayed. */ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1631,6 +1680,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, int exists; int ret = 0; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); + bool name_added = false; dir = read_one_inode(root, key->objectid); if (!dir) @@ -1708,6 +1758,8 @@ out: } kfree(name); iput(dir); + if (!ret && name_added) + ret = 1; return ret; insert: @@ -1719,10 +1771,12 @@ insert: goto out; } btrfs_release_path(path); - ret = insert_one_name(trans, root, path, key->objectid, key->offset, - name, name_len, log_type, &log_key); + ret = insert_one_name(trans, root, key->objectid, key->offset, + name, name_len, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; + if (!ret) + name_added = true; update_size = false; ret = 0; goto out; @@ -1740,12 +1794,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - int ret; + int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; int name_len; unsigned long ptr; unsigned long ptr_end; + struct btrfs_path *fixup_path = NULL; ptr = btrfs_item_ptr_offset(eb, slot); ptr_end = ptr + item_size; @@ -1755,12 +1810,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); - if (ret) - return ret; + if (ret < 0) + break; ptr = (unsigned long)(di + 1); ptr += name_len; + + /* + * If this entry refers to a non-directory (directories can not + * have a link count > 1) and it was added in the transaction + * that was not committed, make sure we fixup the link count of + * the inode the entry points to. 
Otherwise something like + * the following would result in a directory pointing to an + * inode with a wrong link that does not account for this dir + * entry: + * + * mkdir testdir + * touch testdir/foo + * touch testdir/bar + * sync + * + * ln testdir/bar testdir/bar_link + * ln testdir/foo testdir/foo_link + * xfs_io -c "fsync" testdir/bar + * + * <power failure> + * + * mount fs, log replay happens + * + * File foo would remain with a link count of 1 when it has two + * entries pointing to it in the directory testdir. This would + * make it impossible to ever delete the parent directory as + * it would result in stale dentries that can never be deleted. + */ + if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + struct btrfs_key di_key; + + if (!fixup_path) { + fixup_path = btrfs_alloc_path(); + if (!fixup_path) { + ret = -ENOMEM; + break; + } + } + + btrfs_dir_item_key_to_cpu(eb, di, &di_key); + ret = link_to_fixup_dir(trans, root, fixup_path, + di_key.objectid); + if (ret) + break; + } + ret = 0; } - return 0; + btrfs_free_path(fixup_path); + return ret; } /* @@ -2535,8 +2637,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static void wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int transid) +static void wait_log_commit(struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2561,8 +2662,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans, atomic_read(&root->log_commit[index])); } -static void wait_for_writer(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); @@ -2642,7 +2742,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, log_transid); + wait_log_commit(root, log_transid); mutex_unlock(&root->log_mutex); return ctx->log_ret; } @@ -2651,7 +2751,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, log_transid - 1); + wait_log_commit(root, log_transid - 1); while (1) { int batch = atomic_read(&root->log_batch); @@ -2662,7 +2762,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); } - wait_for_writer(trans, root); + wait_for_writer(root); if (batch == atomic_read(&root->log_batch)) break; } @@ -2722,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&log_root_tree->log_writer_wait)) wake_up(&log_root_tree->log_writer_wait); } @@ -2759,7 +2861,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_wait_logged_extents(trans, log, log_transid); - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid); mutex_unlock(&log_root_tree->log_mutex); if (!ret) @@ -2770,11 +2872,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { - wait_log_commit(trans, log_root_tree, + wait_log_commit(log_root_tree, root_log_ctx.log_transid - 1); } - 
wait_for_writer(trans, log_root_tree); + wait_for_writer(log_root_tree); /* * now that we've moved on to the tree of log tree roots, @@ -2852,6 +2954,9 @@ out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: @@ -2863,6 +2968,9 @@ out: atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -4904,6 +5012,94 @@ next_dir_inode: return ret; } +static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_log_ctx *ctx) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(inode)->root; + const u64 ino = btrfs_ino(inode); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->skip_locking = 1; + path->search_commit_root = 1; + + key.objectid = ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + unsigned long ptr; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ + if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); + while (cur_offset < item_size) { + struct btrfs_key inode_key; + struct inode *dir_inode; + + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + inode_key.objectid = btrfs_inode_extref_parent( + leaf, extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + inode_key.objectid = key.offset; + cur_offset = item_size; + } + + dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, + root, NULL); + /* If parent inode was deleted, skip it. */ + if (IS_ERR(dir_inode)) + continue; + + ret = btrfs_log_inode(trans, root, dir_inode, + LOG_INODE_ALL, 0, LLONG_MAX, ctx); + iput(dir_inode); + if (ret) + goto out; + } + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. 
A minimal inode and backref @@ -4923,9 +5119,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; - const struct dentry * const first_parent = parent; - const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > - last_committed); bool log_dentries = false; struct inode *orig_inode = inode; @@ -4986,6 +5179,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) log_dentries = true; + /* + * On unlink we must make sure all our current and old parent directory + * inodes are fully logged. This is to prevent leaving dangling + * directory index entries in directories that were our parents but are + * not anymore. Not doing this results in the old parent directory being + * impossible to delete after log replay (rmdir will always fail with + * error -ENOTEMPTY). + * + * Example 1: + * + * mkdir testdir + * touch testdir/foo + * ln testdir/foo testdir/bar + * sync + * unlink testdir/bar + * xfs_io -c fsync testdir/foo + * <power failure> + * mount fs, triggers log replay + * + * If we don't log the parent directory (testdir), after log replay the + * directory still has an entry pointing to the file inode using the bar + * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and + * the file inode has a link count of 1. + * + * Example 2: + * + * mkdir testdir + * touch foo + * ln foo testdir/foo2 + * ln foo testdir/foo3 + * sync + * unlink testdir/foo3 + * xfs_io -c fsync foo + * <power failure> + * mount fs, triggers log replay + * + * Similar to the first example, after log replay the parent directory + * testdir still has an entry pointing to the file inode with name foo3 + * but the file inode does not have a matching BTRFS_INODE_REF_KEY item + * and has a link count of 2. + */ + if (BTRFS_I(inode)->last_unlink_trans > last_committed) { + ret = btrfs_log_all_parents(trans, orig_inode, ctx); + if (ret) + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; @@ -4994,23 +5234,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; - /* - * On unlink we must make sure our immediate parent directory - * inode is fully logged. This is to prevent leaving dangling - * directory index entries and a wrong directory inode's i_size. - * Not doing so can result in a directory being impossible to - * delete after log replay (rmdir will always fail with error - * -ENOTEMPTY). 
- */ - if (did_unlink && parent == first_parent) - inode_only = LOG_INODE_ALL; - else - inode_only = LOG_INODE_EXISTS; - - if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed || - inode_only == LOG_INODE_ALL) { - ret = btrfs_log_inode(trans, root, inode, inode_only, + if (BTRFS_I(inode)->generation > last_committed) { + ret = btrfs_log_inode(trans, root, inode, + LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); if (ret) goto end_trans; @@ -5098,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_error(fs_info, ret, "Failed to pin buffers while " + btrfs_std_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5112,7 +5338,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5130,7 +5356,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5145,7 +5371,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_error(fs_info, ret, "Couldn't read target root " + btrfs_std_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fbe7c104531c..17ed76d18eb6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -42,6 +42,82 @@ #include "dev-replace.h" #include "sysfs.h" +const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 1, + .ncopies = 3, + }, +}; + +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, + [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, + [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, + [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, + [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -198,7 +274,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, 
void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -211,8 +286,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } invalidate_bdev(*bdev); *bh = btrfs_read_dev_super(*bdev); - if (!*bh) { - ret = -EINVAL; + if (IS_ERR(*bh)) { + ret = PTR_ERR(*bh); blkdev_put(*bdev, flags); goto error; } @@ -345,6 +420,9 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -765,36 +843,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); + btrfs_close_one_device(device); } mutex_unlock(&fs_devices->device_list_mutex); @@ -1116,15 +1165,18 @@ out: return ret; } -static int contains_pending_extent(struct btrfs_trans_handle *trans, +static int contains_pending_extent(struct btrfs_transaction *transaction, struct btrfs_device *device, u64 *start, u64 len) { + struct btrfs_fs_info *fs_info = device->dev_root->fs_info; struct extent_map *em; - struct list_head *search_list = &trans->transaction->pending_chunks; + struct list_head *search_list = &fs_info->pinned_chunks; int ret = 0; u64 physical_start = *start; + if (transaction) + search_list = &transaction->pending_chunks; again: list_for_each_entry(em, search_list, list) { struct map_lookup *map; @@ -1159,8 +1211,8 @@ again: } } } - if (search_list == &trans->transaction->pending_chunks) { - search_list = &trans->root->fs_info->pinned_chunks; + if (search_list != &fs_info->pinned_chunks) { + search_list = &fs_info->pinned_chunks; goto again; } @@ -1169,12 +1221,13 @@ again: /* - * find_free_dev_extent - find free space in the specified device - * @device: the device which we search the free space in - * @num_bytes: the size of the free space that we need - * @start: store the start of the free space. - * @len: the size of the free space. that we find, or the size of the max - * free space if we don't find suitable free space + * find_free_dev_extent_start - find free space in the specified device + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @search_start: the position from which to begin the search + * @start: store the start of the free space. + * @len: the size of the free space 
that we find, or the size + * of the max free space if we don't find suitable free space * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number @@ -1188,9 +1241,9 @@ again: * But if we don't find suitable free space, it is used to store the size of * the max free space. */ -int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -1200,19 +1253,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, u64 max_hole_start; u64 max_hole_size; u64 extent_end; - u64 search_start; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; - /* FIXME use last free of some kind */ - - /* we don't want to overwrite the superblock on the drive, - * so we make sure to start at an offset of at least 1MB - */ - search_start = max(root->fs_info->alloc_start, 1024ull * 1024); - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1273,7 +1318,7 @@ again: * Have to check before we set max_hole_start, otherwise * we could end up sending back this offset anyway. */ - if (contains_pending_extent(trans, device, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { if (key.offset >= search_start) { @@ -1322,7 +1367,7 @@ next: if (search_end > search_start) { hole_size = search_end - search_start; - if (contains_pending_extent(trans, device, &search_start, + if (contains_pending_extent(transaction, device, &search_start, hole_size)) { btrfs_release_path(path); goto again; @@ -1348,6 +1393,24 @@ out: return ret; } +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) +{ + struct btrfs_root *root = device->dev_root; + u64 search_start; + + /* FIXME use last free of some kind */ + + /* + * we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + return find_free_dev_extent_start(trans->transaction, device, + num_bytes, search_start, start, len); +} + static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) @@ -1388,7 +1451,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_error(root->fs_info, ret, "Slot search failed"); + btrfs_std_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1396,10 +1459,10 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { - trans->transaction->have_free_bgs = 1; + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); } out: btrfs_free_path(path); @@ -1787,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1910,7 +1973,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if 
(srcdev->writeable) { fs_devices->rw_devices--; /* zero out the old super if it is writable */ - btrfs_scratch_superblock(srcdev); + btrfs_scratch_superblocks(srcdev->bdev, + rcu_str_deref(srcdev->name)); } if (srcdev->bdev) @@ -1957,10 +2021,11 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); - btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); if (tgtdev->bdev) { - btrfs_scratch_superblock(tgtdev); + btrfs_scratch_superblocks(tgtdev->bdev, + rcu_str_deref(tgtdev->name)); fs_info->fs_devices->open_devices--; } fs_info->fs_devices->num_devices--; @@ -2027,10 +2092,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } } - if (!*device) { - btrfs_err(root->fs_info, "no missing device found"); - return -ENOENT; - } + if (!*device) + return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; return 0; } else { @@ -2295,7 +2358,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info->fs_devices, device); + btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2336,9 +2399,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj, fsid_buf)) - pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); + btrfs_warn(root->fs_info, + "sysfs: failed to create fsid for sprout"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2354,7 +2418,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. 
This can be fixed " "using the \"btrfs balance\" command."); @@ -2374,7 +2438,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2599,7 +2663,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_error(root->fs_info, -ENOENT, + btrfs_std_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2607,7 +2671,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2755,9 +2819,7 @@ out: return ret; } -static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_objectid, - u64 chunk_offset) +static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) { struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; @@ -2785,14 +2847,16 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return -ENOSPC; /* step one, relocate all the extents inside this chunk */ + btrfs_scrub_pause(root); ret = btrfs_relocate_block_group(extent_root, chunk_offset); + btrfs_scrub_continue(root); if (ret) return ret; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } @@ -2855,7 +2919,6 @@ again: if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_relocate_chunk(chunk_root, - found_key.objectid, found_key.offset); if (ret == -ENOSPC) failed++; @@ -2996,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl) * (albeit full) chunks. 
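* (For example, with the usage=90 default set below, chunks that are * still mostly full are not picked up again, while mostly empty ones * remain balance candidates.)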
*/ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->data.usage = 90; } if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->sys.usage = 90; } if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->meta.usage = 90; @@ -3061,13 +3127,46 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group_cache *cache; + u64 chunk_used; + u64 user_thresh_min; + u64 user_thresh_max; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + if (bargs->usage_min == 0) + user_thresh_min = 0; + else + user_thresh_min = div_factor_fine(cache->key.offset, + bargs->usage_min); + + if (bargs->usage_max == 0) + user_thresh_max = 1; + else if (bargs->usage_max > 100) + user_thresh_max = cache->key.offset; + else + user_thresh_max = div_factor_fine(cache->key.offset, + bargs->usage_max); + + if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, + u64 chunk_offset, struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - if (bargs->usage == 0) + if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; @@ -3157,6 +3256,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } +static int chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + if (bargs->stripes_min <= num_stripes + && num_stripes <= bargs->stripes_max) + return 0; + + return 1; +} + static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { @@ -3203,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root, if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { return 0; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; } /* devid filter */ @@ -3223,6 +3338,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* stripes filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && + chunk_stripes_range_filter(leaf, chunk, bargs)) { + return 0; + } + /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { @@ -3237,6 +3358,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; else bargs->limit--; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { + /* + * Same logic as the 'limit' filter; the minimum cannot be + * determined here because we do not have the global information + * about the count of 
all chunks that satisfy the filters. + */ + if (bargs->limit_max == 0) + return 0; + else + bargs->limit_max--; } return 1; @@ -3251,6 +3382,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 old_size; u64 size_to_free; + u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; @@ -3261,9 +3393,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + /* The single value limit and min/max limits use the same bytes in the union; save them so they can be restored before the second pass. */ u64 limit_data = bctl->data.limit; u64 limit_meta = bctl->meta.limit; u64 limit_sys = bctl->sys.limit; + u32 count_data = 0; + u32 count_meta = 0; + u32 count_sys = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3304,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); again: if (!counting) { + /* + * The single value limit and min/max limits use the same bytes + * in the union, so restoring the saved values also resets the + * limit_max counters decremented during the counting pass. + */ bctl->data.limit = limit_data; bctl->meta.limit = limit_meta; bctl->sys.limit = limit_sys; @@ -3351,6 +3491,7 @@ again: } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); if (!counting) { spin_lock(&fs_info->balance_lock); @@ -3371,11 +3512,32 @@ again: spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + count_data++; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + count_sys++; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + count_meta++; + + goto loop; + } + + /* + * Apply the limit_min filter; no need to check if the LIMITS + * filter is used, as limit_min is 0 by default + */ + if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + count_data < bctl->data.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && + count_meta < bctl->meta.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && + count_sys < bctl->sys.limit_min)) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; } ret = btrfs_relocate_chunk(chunk_root, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) @@ -3449,11 +3611,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret); + btrfs_std_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } +/* Non-zero return value signifies invalidity */ +static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, + u64 allowed) +{ + return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl_arg->target, 1) || + (bctl_arg->target & ~allowed))); +} + /* * Should be called with both balance and volume mutexes held */ @@ -3511,27 +3682,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (num_devices > 3) allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); - if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->data.target, 1) || - (bctl->data.target & ~allowed))) { + if (validate_convert_profile(&bctl->data, allowed)) { btrfs_err(fs_info, "unable to start balance with target " "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; } - if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->meta.target, 1) || - 
(bctl->meta.target & ~allowed))) { + if (validate_convert_profile(&bctl->meta, allowed)) { btrfs_err(fs_info, "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; } - if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->sys.target, 1) || - (bctl->sys.target & ~allowed))) { + if (validate_convert_profile(&bctl->sys, allowed)) { btrfs_err(fs_info, "unable to start balance with target system profile %llu", bctl->sys.target); @@ -3573,23 +3738,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } while (read_seqretry(&fs_info->profiles_lock, seq)); if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - int num_tolerated_disk_barrier_failures; - u64 target = bctl->sys.target; - - num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); - if (num_tolerated_disk_barrier_failures > 0 && - (target & - (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_AVAIL_ALLOC_BIT_SINGLE))) - num_tolerated_disk_barrier_failures = 0; - else if (num_tolerated_disk_barrier_failures > 1 && - (target & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))) - num_tolerated_disk_barrier_failures = 1; - - fs_info->num_tolerated_disk_barrier_failures = - num_tolerated_disk_barrier_failures; + fs_info->num_tolerated_disk_barrier_failures = min( + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), + btrfs_get_num_tolerated_disk_barrier_failures( + bctl->sys.target)); } ret = insert_balance_item(fs_info->tree_root, bctl); @@ -4077,7 +4229,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; - u64 chunk_objectid; u64 chunk_offset; int ret; int slot; @@ -4154,11 +4305,10 @@ again: break; } - chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); - ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + ret = btrfs_relocate_chunk(root, chunk_offset); mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto done; @@ -4200,7 +4350,8 @@ again: u64 start = new_size; u64 len = old_size - new_size; - if (contains_pending_extent(trans, device, &start, len)) { + if (contains_pending_extent(trans->transaction, device, + &start, len)) { unlock_chunks(root); checked_pending_chunks = true; failed = 0; @@ -4287,65 +4438,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = { - .sub_stripes = 2, - .dev_stripes = 1, - .devs_max = 0, /* 0 == as many as possible */ - .devs_min = 4, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_RAID1] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 2, - .devs_min = 2, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_DUP] = { - .sub_stripes = 1, - .dev_stripes = 2, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID0] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_SINGLE] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_RAID5] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 2, - }, - 
[BTRFS_RAID_RAID6] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 3, - .devs_increment = 1, - .ncopies = 3, - }, -}; - static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ @@ -5071,9 +5163,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) * and the stripes */ sizeof(u64) * (total_stripes), - GFP_NOFS); - if (!bbio) - return NULL; + GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 0); atomic_set(&bbio->refs, 1); @@ -5741,23 +5831,23 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) +static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) { bio->bi_private = bbio->private; bio->bi_end_io = bbio->end_io; - bio_endio(bio, err); + bio_endio(bio); btrfs_put_bbio(bbio); } -static void btrfs_end_bio(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio) { struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (err) { + if (bio->bi_error) { atomic_inc(&bbio->error); - if (err == -EIO || err == -EREMOTEIO) { + if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -5795,17 +5885,16 @@ static void btrfs_end_bio(struct bio *bio, int err) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - err = -EIO; + bio->bi_error = -EIO; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - set_bit(BIO_UPTODATE, &bio->bi_flags); - err = 0; + bio->bi_error = 0; } - btrfs_end_bbio(bbio, bio, err); + btrfs_end_bbio(bbio, bio); } else if (!is_orig_bio) { bio_put(bio); } @@ -5826,7 +5915,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_pending_bios *pending_bios; if (device->missing || !device->bdev) { - bio_endio(bio, -EIO); + bio_io_error(bio); return; } @@ -5871,34 +5960,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, &device->work); } -static int bio_size_ok(struct block_device *bdev, struct bio *bio, - sector_t sector) -{ - struct bio_vec *prev; - struct request_queue *q = bdev_get_queue(bdev); - unsigned int max_sectors = queue_max_sectors(q); - struct bvec_merge_data bvm = { - .bi_bdev = bdev, - .bi_sector = sector, - .bi_rw = bio->bi_rw, - }; - - if (WARN_ON(bio->bi_vcnt == 0)) - return 1; - - prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bio_sectors(bio) > max_sectors) - return 0; - - if (!q->merge_bvec_fn) - return 1; - - bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len; - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) - return 0; - return 1; -} - static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct bio *bio, u64 physical, int dev_nr, int rw, int async) @@ -5932,38 +5993,6 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfsic_submit_bio(rw, bio); } -static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, - struct bio *first_bio, struct btrfs_device *dev, - int dev_nr, int rw, int async) -{ - struct bio_vec *bvec = first_bio->bi_io_vec; - struct bio *bio; - int nr_vecs = bio_get_nr_vecs(dev->bdev); - u64 physical = bbio->stripes[dev_nr].physical; - -again: - bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS); - if (!bio) - return -ENOMEM; - - while (bvec <= 
(first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { - if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len) { - u64 len = bio->bi_iter.bi_size; - - atomic_inc(&bbio->stripes_pending); - submit_stripe_bio(root, bbio, bio, physical, dev_nr, - rw, async); - physical += len; - goto again; - } - bvec++; - } - - submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async); - return 0; -} - static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); @@ -5973,8 +6002,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - - btrfs_end_bbio(bbio, bio, -EIO); + bio->bi_error = -EIO; + btrfs_end_bbio(bbio, bio); } } @@ -6036,18 +6065,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, continue; } - /* - * Check and see if we're ok with this bio based on it's size - * and offset with the given device. - */ - if (!bio_size_ok(dev->bdev, first_bio, - bbio->stripes[dev_nr].physical >> 9)) { - ret = breakup_stripe_bio(root, bbio, first_bio, dev, - dev_nr, rw, async_submit); - BUG_ON(ret); - continue; - } - if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ @@ -6671,8 +6688,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "error %d while searching for dev_stats item for device %s!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "error %d while searching for dev_stats item for device %s", ret, rcu_str_deref(device->name)); goto out; } @@ -6682,8 +6699,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "delete too small dev_stats item for device %s failed %d!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "delete too small dev_stats item for device %s failed %d", rcu_str_deref(device->name), ret); goto out; } @@ -6696,9 +6713,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "insert dev_stats item for device %s failed %d!\n", - rcu_str_deref(device->name), ret); + btrfs_warn_in_rcu(dev_root->fs_info, + "insert dev_stats item for device %s failed %d", + rcu_str_deref(device->name), ret); goto out; } } @@ -6752,8 +6769,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_err_rl_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6772,8 +6789,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_info_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), 
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6817,22 +6834,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root, return 0; } -int btrfs_scratch_superblock(struct btrfs_device *device) +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) { struct buffer_head *bh; struct btrfs_super_block *disk_super; + int copy_num; - bh = btrfs_read_dev_super(device->bdev); - if (!bh) - return -EINVAL; - disk_super = (struct btrfs_super_block *)bh->b_data; + if (!bdev) + return; - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; + copy_num++) { - return 0; + if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); } /* @@ -6900,3 +6929,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } + +void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +}
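
The btrfs_close_one_device() addition above is a textbook RCU list replacement: a fully initialized stand-in device is published with list_replace_rcu(), so concurrent readers of fs_devices->dev_list always see a complete entry, and the old device is freed only via call_rcu() once every pre-existing reader has finished; uuid_mutex serializes the writers. A generic sketch of the same pattern, using an illustrative struct item rather than the btrfs types:

#include <linux/rculist.h>
#include <linux/slab.h>

/* Illustrative element of an RCU-protected list (not a btrfs type). */
struct item {
	struct list_head list;
	struct rcu_head rcu;
	/* payload ... */
};

static void free_item_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct item, rcu));
}

/* Callers must serialize against other writers (btrfs holds uuid_mutex). */
static void replace_item(struct item *old, struct item *replacement)
{
	/* Readers traversing the list see either the old or the new
	 * entry, never a gap or a half-initialized element. */
	list_replace_rcu(&old->list, &replacement->list);

	/* Defer the free until all pre-existing readers have left
	 * their RCU read-side critical sections. */
	call_rcu(&old->rcu, free_item_rcu);
}
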
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 95842a909e7f..ec5712372732 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -256,7 +256,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ - struct kobject super_kobj; + struct kobject fsid_kobj; struct kobject *device_dir_kobj; struct completion kobj_unregister; }; @@ -334,10 +334,15 @@ struct btrfs_raid_attr { int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ int devs_min; /* min devs needed */ + int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies the data has */ }; +extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; + +extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; + struct map_lookup { u64 type; int io_align; @@ -375,6 +380,20 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) +#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) +#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8) + +#define BTRFS_BALANCE_ARGS_MASK \ + (BTRFS_BALANCE_ARGS_PROFILES | \ + BTRFS_BALANCE_ARGS_USAGE | \ + BTRFS_BALANCE_ARGS_DEVID | \ + BTRFS_BALANCE_ARGS_DRANGE | \ + BTRFS_BALANCE_ARGS_VRANGE | \ + BTRFS_BALANCE_ARGS_LIMIT | \ + BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ + BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ + BTRFS_BALANCE_ARGS_USAGE_RANGE) /* * Profile changing flags. When SOFT is set we won't relocate chunk if * @@ -453,6 +472,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent_start(struct btrfs_transaction *transaction, + struct btrfs_device *device, u64 num_bytes, + u64 search_start, u64 *start, u64 *max_avail); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); @@ -471,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -int btrfs_scratch_superblock(struct btrfs_device *device); +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_root *root, @@ -544,5 +566,6 @@ static inline void unlock_chunks(struct btrfs_root *root) struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_close_one_device(struct btrfs_device *device); #endif
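
One detail worth spelling out about the new *_RANGE balance arguments: in the user-visible struct btrfs_balance_args, each single-value field and its min/max pair occupy the same storage through an anonymous union (the u64 usage overlays u32 usage_min plus u32 usage_max, and likewise limit overlays limit_min/limit_max). That aliasing is why chunk_usage_filter() can test bargs->usage_min == 0 for backward compatibility, and why the comments in __btrfs_balance() speak of the limits sharing bytes. A plain C fragment mirroring the layout (field set trimmed to the example; not the actual uapi header):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the anonymous union in the balance args (trimmed). */
struct balance_args_sketch {
	union {
		uint64_t usage;			/* single-value form */
		struct {
			uint32_t usage_min;	/* range form */
			uint32_t usage_max;
		};
	};
};

int main(void)
{
	struct balance_args_sketch a = { .usage = 90 };

	/* On little-endian hosts this prints "90 0": the low half of
	 * the single value aliases usage_min, the high half usage_max. */
	printf("%u %u\n", a.usage_min, a.usage_max);
	return 0;
}
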