From a22285a6a32390195235171b89d157ed1a1fe932 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 16 May 2010 10:48:46 -0400 Subject: Btrfs: Integrate metadata reservation with start_transaction Besides simplify the code, this change makes sure all metadata reservation for normal metadata operations are released after committing transaction. Changes since V1: Add code that check if unlink and rmdir will free space. Add ENOSPC handling for clone ioctl. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/file.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29ff749ff4ca..41e09e24e295 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -126,8 +126,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, NULL); - if (err) - return err; + BUG_ON(err); for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -142,7 +141,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, * at this time. */ } - return err; + return 0; } /* @@ -1008,7 +1007,7 @@ out_nolock: num_written = err; if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { @@ -1104,9 +1103,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (file && file->private_data) btrfs_ioctl_trans_end(file); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto out; } -- cgit v1.2.3 From 0ca1f7ceb1991099ed5273885ebcf4323948c72e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 16 May 2010 10:48:47 -0400 Subject: Btrfs: Update metadata reservation for delayed allocation Introduce metadata reservation context for delayed allocation and update various related functions. This patch also introduces EXTENT_FIRST_DELALLOC control bit for set/clear_extent_bit. It tells set/clear_bit_hook whether they are processing the first extent_state with EXTENT_DELALLOC bit set. This change is important if set/clear_extent_bit involves multiple extent_state. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/ctree.h | 19 +-- fs/btrfs/extent-tree.c | 361 ++++++++++++++++-------------------------------- fs/btrfs/extent_io.c | 63 ++++----- fs/btrfs/extent_io.h | 8 +- fs/btrfs/file.c | 27 +--- fs/btrfs/inode.c | 131 +++++++----------- fs/btrfs/ioctl.c | 29 ++-- fs/btrfs/ordered-data.c | 7 - 9 files changed, 232 insertions(+), 415 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7a4dee199832..40510d9351f5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -137,8 +137,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. */ spinlock_t accounting_lock; + atomic_t outstanding_extents; int reserved_extents; - int outstanding_extents; /* * ordered_data_close is set by truncate when a file that used diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e0aa9fb563e2..d4744192eada 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2079,19 +2079,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes); -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, int num_items, int *retries); @@ -2099,6 +2088,10 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 657df6e002d3..b1822e752b4a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -64,12 +64,6 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 num_bytes); -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 to_reclaim); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -2880,189 +2874,14 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) BTRFS_BLOCK_GROUP_DATA); } -static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) -{ - u64 num_bytes; - int level; - - level = BTRFS_MAX_LEVEL - 2; - /* - * NOTE: these calculations are absolutely the worst possible case. - * This assumes that _every_ item we insert will require a new leaf, and - * that the tree has grown to its maximum level size. - */ - - /* - * for every item we insert we could insert both an extent item and a - * extent ref item. Then for ever item we insert, we will need to cow - * both the original leaf, plus the leaf to the left and right of it. - * - * Unless we are talking about the extent root, then we just want the - * number of items * 2, since we just need the extent item plus its ref. - */ - if (root == root->fs_info->extent_root) - num_bytes = num_items * 2; - else - num_bytes = (num_items + (2 * num_items)) * 3; - - /* - * num_bytes is total number of leaves we could need times the leaf - * size, and then for every leaf we could end up cow'ing 2 nodes per - * level, down to the leaf level. - */ - num_bytes = (num_bytes * root->leafsize) + - (num_bytes * (level * 2)) * root->nodesize; - - return num_bytes; -} - -/* - * Unreserve metadata space for delalloc. If we have less reserved credits than - * we have extents, this function does nothing. - */ -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); - - spin_lock(&meta_sinfo->lock); - spin_lock(&BTRFS_I(inode)->accounting_lock); - if (BTRFS_I(inode)->reserved_extents <= - BTRFS_I(inode)->outstanding_extents) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - spin_unlock(&meta_sinfo->lock); - return 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); - - BTRFS_I(inode)->reserved_extents -= num_items; - BUG_ON(BTRFS_I(inode)->reserved_extents < 0); - - if (meta_sinfo->bytes_delalloc < num_bytes) { - bug = true; - meta_sinfo->bytes_delalloc = 0; - } else { - meta_sinfo->bytes_delalloc -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); - - return 0; -} - -static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) -{ - u64 thresh; - - thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use; - - thresh = meta_sinfo->total_bytes - thresh; - thresh *= 80; - do_div(thresh, 100); - if (thresh <= meta_sinfo->bytes_delalloc) - meta_sinfo->force_delalloc = 1; - else - meta_sinfo->force_delalloc = 0; -} - -/* - * Reserve metadata space for delalloc. - */ -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int flushed = 0; - int force_delalloc; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); -again: - spin_lock(&meta_sinfo->lock); - - force_delalloc = meta_sinfo->force_delalloc; - - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); - - if (!flushed) - meta_sinfo->bytes_delalloc += num_bytes; - - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; - - if (used > meta_sinfo->total_bytes) { - flushed++; - - if (flushed == 1) { - if (maybe_allocate_chunk(NULL, root, meta_sinfo, - num_bytes)) - goto again; - flushed++; - } else { - spin_unlock(&meta_sinfo->lock); - } - - if (flushed == 2) { - filemap_flush(inode->i_mapping); - goto again; - } else if (flushed == 3) { - shrink_delalloc(NULL, root, meta_sinfo, num_bytes); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_delalloc -= num_bytes; - spin_unlock(&meta_sinfo->lock); - printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->outstanding_extents, - BTRFS_I(inode)->reserved_extents); - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; - } - - BTRFS_I(inode)->reserved_extents += num_items; - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); - - if (!flushed && force_delalloc) - filemap_flush(inode->i_mapping); - - return 0; -} - /* * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 used; int ret = 0, committed = 0; @@ -3147,12 +2966,13 @@ alloc: } /* - * if there was an error for whatever reason after calling - * btrfs_check_data_free_space, call this so we can cleanup the counters. + * called when we are clearing an delalloc extent from the + * inode's io_tree or there was an error for whatever reason + * after calling btrfs_check_data_free_space */ -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ @@ -3165,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root, spin_unlock(&data_sinfo->lock); } -/* called when we are adding a delalloc extent to the inode's io_tree */ -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *data_sinfo; - - /* get the space info for where this inode will be storing its data */ - data_sinfo = BTRFS_I(inode)->space_info; - - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_delalloc += bytes; - - /* - * we are adding a delalloc extent without calling - * btrfs_check_data_free_space first. This happens on a weird - * writepage condition, but shouldn't hurt our accounting - */ - if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { - data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; - BTRFS_I(inode)->reserved_bytes = 0; - } else { - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; - } - - spin_unlock(&data_sinfo->lock); -} - -/* called when we are clearing an delalloc extent from the inode's io_tree */ -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *info; - - info = BTRFS_I(inode)->space_info; - - spin_lock(&info->lock); - info->bytes_delalloc -= bytes; - spin_unlock(&info->lock); -} - static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -3331,18 +3109,19 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 to_reclaim) + struct btrfs_root *root, u64 to_reclaim) { + struct btrfs_block_rsv *block_rsv; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; int pause = 1; int ret; - spin_lock(&sinfo->lock); - reserved = sinfo->bytes_delalloc; - spin_unlock(&sinfo->lock); + block_rsv = &root->fs_info->delalloc_block_rsv; + spin_lock(&block_rsv->lock); + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); if (reserved == 0) return 0; @@ -3361,11 +3140,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, pause = 1; } - spin_lock(&sinfo->lock); - if (reserved > sinfo->bytes_delalloc) - reclaimed = reserved - sinfo->bytes_delalloc; - reserved = sinfo->bytes_delalloc; - spin_unlock(&sinfo->lock); + spin_lock(&block_rsv->lock); + if (reserved > block_rsv->reserved) + reclaimed = reserved - block_rsv->reserved; + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); if (reserved == 0 || reclaimed >= max_reclaim) break; @@ -3394,7 +3173,7 @@ static int should_retry_reserve(struct btrfs_trans_handle *trans, if (trans && trans->transaction->in_commit) return -ENOSPC; - ret = shrink_delalloc(trans, root, space_info, num_bytes); + ret = shrink_delalloc(trans, root, num_bytes); if (ret) return ret; @@ -3754,6 +3533,108 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +{ + return num_bytes >>= 3; +} + +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve; + int nr_extents; + int retries = 0; + int ret; + + if (btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); + + num_bytes = ALIGN(num_bytes, root->sectorsize); +again: + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; + if (nr_extents > BTRFS_I(inode)->reserved_extents) { + nr_extents -= BTRFS_I(inode)->reserved_extents; + to_reserve = calc_trans_metadata_size(root, nr_extents); + } else { + nr_extents = 0; + to_reserve = 0; + } + + to_reserve += calc_csum_metadata_size(inode, num_bytes); + ret = reserve_metadata_bytes(block_rsv, to_reserve); + if (ret) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, + &retries); + if (ret > 0) + goto again; + return ret; + } + + BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + if (block_rsv->size > 512 * 1024 * 1024) + shrink_delalloc(NULL, root, to_reserve); + + return 0; +} + +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free; + int nr_extents; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents < BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; + BTRFS_I(inode)->reserved_extents -= nr_extents; + } else { + nr_extents = 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + to_free = calc_csum_metadata_size(inode, num_bytes); + if (nr_extents > 0) + to_free += calc_trans_metadata_size(root, nr_extents); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2d03684fab2..1a57c17d4029 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree, } static int set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->set_bit_hook) { return tree->ops->set_bit_hook(tree->mapping->host, - state->start, state->end, - state->state, bits); + state, bits); } return 0; } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); @@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int bits) + int *bits) { struct rb_node *node; + int bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (end < start) { @@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree, if (ret) return ret; - if (bits & EXTENT_DIRTY) + if (bits_to_set & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - state->state |= bits; + state->state |= bits_to_set; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, * struct is freed and removed from the tree */ static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) + struct extent_state *state, + int *bits, int wake) { - int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret = state->state & bits_to_clear; - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; @@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree, state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); - if (delete || state->state == 0) { + if (state->state == 0) { if (state->tree) { - clear_state_cb(tree, state, state->state); rb_erase(&state->rb_node, &tree->state); state->tree = NULL; free_extent_state(state); @@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set = 0; int clear = 0; + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: @@ -580,8 +581,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, wake, - delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -602,7 +602,7 @@ hit_next: if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, wake, delete); + set |= clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; @@ -613,7 +613,7 @@ hit_next: else next_node = NULL; - set |= clear_state_bit(tree, state, bits, wake, delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -706,19 +706,19 @@ out: static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int bits) + int *bits) { int ret; + int bits_to_set = *bits & ~EXTENT_CTLBITS; ret = set_state_cb(tree, state, bits); if (ret) return ret; - - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - state->state |= bits; + state->state |= bits_to_set; return 0; } @@ -757,6 +757,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -778,7 +779,7 @@ again: */ node = tree_search(tree, start); if (!node) { - err = insert_state(tree, prealloc, start, end, bits); + err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); goto out; @@ -802,7 +803,7 @@ hit_next: goto out; } - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; @@ -852,7 +853,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; cache_state(state, cached_state); @@ -877,7 +878,7 @@ hit_next: else this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, - bits); + &bits); BUG_ON(err == -EEXIST); if (err) { prealloc = NULL; @@ -903,7 +904,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, &bits); if (err) { prealloc = NULL; goto out; @@ -966,8 +967,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, - NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, @@ -1435,9 +1435,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; - if (op & EXTENT_CLEAR_ACCOUNTING) - clear_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bbab4813646f..86f10dc791d9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -16,7 +16,9 @@ #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_FIRST_DELALLOC (1 << 12) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* flags for bio submission */ #define EXTENT_BIO_COMPRESSED 1 @@ -69,10 +71,10 @@ struct extent_io_ops { struct extent_state *state); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long bits); + int *bits); int (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 41e09e24e295..6d8f817eadb5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -852,13 +852,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* do the reserve before the mutex lock in case we have to do some - * flushing. We wouldn't deadlock, but this is more polite. - */ - err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (err) - goto out_nolock; - mutex_lock(&inode->i_mutex); current->backing_dev_info = inode->i_mapping->backing_dev_info; @@ -921,7 +914,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_data_free_space(root, inode, write_bytes); + ret = btrfs_delalloc_reserve_space(inode, write_bytes); if (ret) goto out; @@ -929,26 +922,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, pos, first_index, last_index, write_bytes); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } ret = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, buf); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); - btrfs_drop_pages(pages, num_pages); - goto out; + if (ret == 0) { + dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); } - ret = dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } @@ -975,9 +962,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); -out_nolock: kfree(pages); if (pinned[0]) page_cache_release(pinned[0]); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4b0fd12df68..6e54665d37f7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -414,6 +415,7 @@ again: trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { @@ -439,7 +441,6 @@ again: start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); btrfs_end_transaction(trans, root); @@ -734,6 +735,7 @@ static noinline int cow_file_range(struct inode *inode, trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; actual_end = min_t(u64, isize, end + 1); @@ -753,7 +755,6 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); @@ -1226,15 +1227,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, } static int btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) + struct extent_state *orig, u64 split) { + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_inc(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1252,10 +1251,7 @@ static int btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_dec(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1264,8 +1260,8 @@ static int btrfs_merge_extent_hook(struct inode *inode, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* @@ -1273,17 +1269,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_delalloc_reserve_space(root, inode, end - start + 1); + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else + atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += end - start + 1; - root->fs_info->delalloc_bytes += end - start + 1; + BTRFS_I(inode)->delalloc_bytes += len; + root->fs_info->delalloc_bytes += len; if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -1297,45 +1294,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * extent_io.c clear_bit_hook, see set_bit_hook for why */ static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long bits) + struct extent_state *state, int *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - if (bits & EXTENT_DO_ACCOUNTING) { - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); - } + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else if (!(*bits & EXTENT_DO_ACCOUNTING)) + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + if (*bits & EXTENT_DO_ACCOUNTING) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); - if (state->end - state->start + 1 > - root->fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs warning: delalloc account " - "%llu %llu\n", - (unsigned long long) - state->end - state->start + 1, - (unsigned long long) - root->fs_info->delalloc_bytes); - btrfs_delalloc_free_space(root, inode, (u64)-1); - root->fs_info->delalloc_bytes = 0; - BTRFS_I(inode)->delalloc_bytes = 0; - } else { - btrfs_delalloc_free_space(root, inode, - state->end - - state->start + 1); - root->fs_info->delalloc_bytes -= state->end - - state->start + 1; - BTRFS_I(inode)->delalloc_bytes -= state->end - - state->start + 1; - } + root->fs_info->delalloc_bytes -= len; + BTRFS_I(inode)->delalloc_bytes -= len; + if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_del_init(&BTRFS_I(inode)->delalloc_inodes); @@ -1520,6 +1504,7 @@ again: goto again; } + BUG(); btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); out: @@ -1650,7 +1635,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; @@ -1668,9 +1653,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); } goto out; } @@ -1680,6 +1666,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 0, &cached_state, GFP_NOFS); trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; @@ -1711,12 +1699,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - /* this also removes the ordered extent from the tree */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -3219,11 +3208,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -3231,8 +3216,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) again: page = grab_cache_page(mapping, index); if (!page) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; } @@ -3295,8 +3279,7 @@ again: out_unlock: if (ret) - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -4878,6 +4861,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { + WARN_ON(1); if (!trans) { kunmap(page); free_extent_map(em); @@ -5142,7 +5126,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; @@ -5151,13 +5135,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - ret = VM_FAULT_SIGBUS; - goto out; - } - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -5167,7 +5144,6 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -5208,7 +5184,6 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); ret = VM_FAULT_SIGBUS; - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; } ret = 0; @@ -5235,10 +5210,10 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); if (!ret) return VM_FAULT_LOCKED; unlock_page(page); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: return ret; } @@ -5383,7 +5358,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_unlink_trans = 0; spin_lock_init(&ei->accounting_lock); - ei->outstanding_extents = 0; + atomic_set(&ei->outstanding_extents, 0); ei->reserved_extents = 0; ei->ordered_data_close = 0; @@ -5411,6 +5386,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); + WARN_ON(BTRFS_I(inode)->reserved_extents); /* * This can happen where we create an inode, but somebody else also @@ -5970,8 +5947,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } - ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) goto out; @@ -6037,8 +6013,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3066da468c6d..6a706e691377 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -587,19 +587,9 @@ static int btrfs_defrag_file(struct file *file, if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = 1; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) { - ret = -ENOSPC; - break; - } - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); - ret = -ENOSPC; - break; - } + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) + goto err_unlock; again: if (inode->i_size == 0 || i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { @@ -608,8 +598,10 @@ again: } page = grab_cache_page(inode->i_mapping, i); - if (!page) + if (!page) { + ret = -ENOMEM; goto err_reservations; + } if (!PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -617,6 +609,7 @@ again: if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + ret = -EIO; goto err_reservations; } } @@ -630,8 +623,7 @@ again: wait_on_page_writeback(page); if (PageDirty(page)) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto loop_unlock; } @@ -669,7 +661,6 @@ loop_unlock: page_cache_release(page); mutex_unlock(&inode->i_mutex); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); i++; } @@ -699,9 +690,9 @@ loop_unlock: return 0; err_reservations: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +err_unlock: mutex_unlock(&inode->i_mutex); - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a127c0ebb2dc..c9f1020572f2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -311,13 +311,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, - inode, 1); - spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); -- cgit v1.2.3 From 4b46fce23349bfca781a32e2707a18328ca5ae22 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sun, 23 May 2010 11:00:55 -0400 Subject: Btrfs: add basic DIO read/write support This provides basic DIO support for reading and writing. It does not do the work to recover from mismatching checksums, that will come later. A few design changes have been made from Jim's code (sorry Jim!) 1) Use the generic direct-io code. Jim originally re-wrote all the generic DIO code in order to account for all of BTRFS's oddities, but thanks to that work it seems like the best bet is to just ignore compression and such and just opt to fallback on buffered IO. 2) Fallback on buffered IO for compressed or inline extents. Jim's code did it's own buffering to make dio with compressed extents work. Now we just fallback onto normal buffered IO. 3) Use ordered extents for the writes so that all of the lock_extent() lookup_ordered() type checks continue to work. 4) Do the lock_extent() lookup_ordered() loop in readpage so we don't race with DIO writes. I've tested this with fsx and everything works great. This patch depends on my dio and filemap.c patches to work. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 + fs/btrfs/file-item.c | 25 ++- fs/btrfs/file.c | 69 ++++++- fs/btrfs/inode.c | 487 +++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/ordered-data.c | 75 +++++++- fs/btrfs/ordered-data.h | 9 +- 6 files changed, 631 insertions(+), 36 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5ed0223d1cbe..e9bf86415e86 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2317,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset, u32 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 21aead39a76c..a562a250ae77 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) { u32 sum; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; - u64 offset; + u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, WARN_ON(bio->bi_vcnt <= 0); disk_bytenr = (u64)bio->bi_sector << 9; + if (dio) + offset = logical_offset; while (bio_index < bio->bi_vcnt) { - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); if (ret == 0) goto found; @@ -238,6 +242,7 @@ found: else set_state_private(io_tree, offset, sum); disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; bio_index++; bvec++; } @@ -245,6 +250,18 @@ found: return 0; } +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); +} + int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6d8f817eadb5..a28810abfb98 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -822,6 +822,47 @@ again: return 0; } +/* Copied from read-write.c */ +static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + if (!kiocbIsKicked(iocb)) + schedule(); + else + kiocbClearKicked(iocb); + __set_current_state(TASK_RUNNING); +} + +/* + * Just a copy of what do_sync_write does. + */ +static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t pos, loff_t *ppos) +{ + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + unsigned long nr_segs = 1; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = pos; + kiocb.ki_left = count; + kiocb.ki_nbytes = count; + + while (1) { + ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos, + ppos, count, count); + if (ret != -EIOCBRETRY) + break; + wait_on_retry_sync_kiocb(&kiocb); + } + + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + *ppos = kiocb.ki_pos; + return ret; +} + static ssize_t btrfs_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -838,12 +879,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unsigned long first_index; unsigned long last_index; int will_write; + int buffered = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); pinned[0] = NULL; pinned[1] = NULL; @@ -867,13 +907,34 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, goto out; file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = __btrfs_direct_write(file, buf, count, pos, + ppos); + pos += num_written; + count -= num_written; + + /* We've written everything we wanted to, exit */ + if (num_written < 0 || !count) + goto out; + /* + * We are going to do buffered for the rest of the range, so we + * need to make sure to invalidate the buffered pages when we're + * done. + */ + buffered = 1; + buf += num_written; + } + + nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE / (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); /* generic_write_checks can change our pos */ start_pos = pos; - BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; @@ -1007,7 +1068,7 @@ out: btrfs_end_transaction(trans, root); } } - if (file->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT && buffered) { invalidate_mapping_pages(inode->i_mapping, start_pos >> PAGE_CACHE_SHIFT, (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 460dd512eebd..1695440a59a4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -698,6 +698,38 @@ retry: return 0; } +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -770,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); - - read_lock(&BTRFS_I(inode)->extent_tree.lock); - em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, - start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. - */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&BTRFS_I(inode)->extent_tree.lock); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { @@ -5171,11 +5181,440 @@ out: return em; } +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + + trans = btrfs_join_transaction(root, 0); + if (!trans) + return ERR_PTR(-ENOMEM); + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } +out: + btrfs_end_transaction(trans, root); + return em; +} + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) + goto map; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + u64 block_start; + int type; + int ret; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->block_len - (start - em->start)); + block_start = em->block_start + (start - em->start); + ret = btrfs_add_ordered_extent_dio(inode, start, + start, len, len, type); + if (ret) { + free_extent_map(em); + return ret; + } + } else { + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->block_len); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, + GFP_NOFS); +map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = em->len - (start - em->start); + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; +}; + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR "btrfs csum failed ino %lu off" + " %llu csum %u private %u\n", + inode->i_ino, (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + int ret; + + if (err) + goto out_done; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered, + dip->logical_offset, dip->bytes); + if (!ret) + goto out_done; + + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + btrfs_ordered_update_i_size(inode, 0, ordered); + btrfs_update_inode(trans, root, inode); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); +out: + btrfs_delalloc_release_metadata(inode, ordered->len); + btrfs_end_transaction(trans, root); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); +out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + u64 start; + int skip_sum; + int write = rw & (1 << BIO_RW); + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + if (!skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + ret = -ENOMEM; + goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + start = dip->logical_offset; + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = bio->bi_sector << 9; + bio->bi_private = dip; + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto out_err; + + if (write && !skip_sum) + btrfs_csum_one_bio(root, inode, bio, dip->logical_offset, 1); + else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + dip->logical_offset, dip->csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 0); + if (ret) + goto out_err; + return; +out_err: + kfree(dip->csums); + kfree(dip); +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, + dip->logical_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); +} + static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - return -EINVAL; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + u64 lockstart, lockend; + ssize_t ret; + + lockstart = offset; + lockend = offset + iov_length(iov, nr_segs) - 1; + while (1) { + lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs, + btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + unlock_extent(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. + */ + unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + } + + return ret; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c9f1020572f2..e56c72bc5add 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + /* * look find the first ordered struct that has this offset, otherwise * the first one less than this offset @@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * The tree is given a single reference on the ordered extent that was * inserted. */ -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + /* one ref for the tree */ atomic_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, return 0; } +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1); +} + /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one @@ -484,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -581,6 +609,47 @@ out: return entry; } +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock(&tree->lock); + return entry; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c82f76a9f040..8ac365492a3f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -72,6 +72,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int tyep); + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); @@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -- cgit v1.2.3 From 11c65dccf70be9ace5dbd3906778e1a099b1fee1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sun, 23 May 2010 11:07:21 -0400 Subject: Btrfs: do aio_write instead of write In order for AIO to work, we need to implement aio_write. This patch converts our btrfs_file_write to btrfs_aio_write. I've tested this with xfstests and nothing broke, and the AIO stuff magically started working. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 11 +++- fs/btrfs/file.c | 176 +++++++++++++++++++++++++++------------------------ 2 files changed, 104 insertions(+), 83 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1a57c17d4029..a53aca338c7f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2017,6 +2017,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; + struct btrfs_ordered_extent *ordered; int ret; int nr = 0; size_t page_offset = 0; @@ -2028,7 +2029,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); end = page_end; - lock_extent(tree, start, end, GFP_NOFS); + while (1) { + lock_extent(tree, start, end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a28810abfb98..233aea2e5ef2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -46,32 +46,42 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, - const char __user *buf) + struct iov_iter *i) { - long page_fault = 0; - int i; + size_t copied; + int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[i]; - fault_in_pages_readable(buf, count); + struct page *page = prepared_pages[pg]; +again: + if (unlikely(iov_iter_fault_in_readable(i, count))) + return -EFAULT; /* Copy data from userspace to the current page */ - kmap(page); - page_fault = __copy_from_user(page_address(page) + offset, - buf, count); + copied = iov_iter_copy_from_user(page, i, offset, count); + /* Flush processor's dcache for this page */ flush_dcache_page(page); - kunmap(page); - buf += count; - write_bytes -= count; + iov_iter_advance(i, copied); + write_bytes -= copied; - if (page_fault) - break; + if (unlikely(copied == 0)) { + count = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } } - return page_fault ? -EFAULT : 0; + return 0; } /* @@ -822,60 +832,24 @@ again: return 0; } -/* Copied from read-write.c */ -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - -/* - * Just a copy of what do_sync_write does. - */ -static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf, - size_t count, loff_t pos, loff_t *ppos) +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; - unsigned long nr_segs = 1; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = pos; - kiocb.ki_left = count; - kiocb.ki_nbytes = count; - - while (1) { - ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos, - ppos, count, count); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; - return ret; -} - -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - loff_t pos; + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *pinned[2]; + struct page **pages = NULL; + struct iov_iter i; + loff_t *ppos = &iocb->ki_pos; loff_t start_pos; ssize_t num_written = 0; ssize_t err = 0; + size_t count; + size_t ocount; int ret = 0; - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page **pages = NULL; int nrptrs; - struct page *pinned[2]; unsigned long first_index; unsigned long last_index; int will_write; @@ -887,13 +861,17 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, pinned[0] = NULL; pinned[1] = NULL; - pos = *ppos; start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); mutex_lock(&inode->i_mutex); + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + goto out; + count = ocount; + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -910,14 +888,48 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, BTRFS_I(inode)->sequence++; if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(file, buf, count, pos, - ppos); - pos += num_written; - count -= num_written; + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; - /* We've written everything we wanted to, exit */ - if (num_written < 0 || !count) + num_written = generic_file_direct_write(iocb, iov, &nr_segs, + pos, ppos, count, + ocount); + + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + + if (num_written < 0) { + if (num_written != -EIOCBQUEUED) { + /* + * aio land will take care of releasing the + * delalloc + */ + btrfs_delalloc_release_space(inode, count); + } + ret = num_written; + num_written = 0; goto out; + } else if (num_written == count) { + /* pick up pos changes done by the generic code */ + pos = *ppos; + goto out; + } + + /* + * the buffered IO will reserve bytes for the rest of the + * range, don't double count them here + */ + btrfs_delalloc_release_space(inode, count - num_written); /* * We are going to do buffered for the rest of the range, so we @@ -925,18 +937,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, * done. */ buffered = 1; - buf += num_written; + pos += num_written; } - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); + iov_iter_init(&i, iov, nr_segs, count, num_written); + nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); /* generic_write_checks can change our pos */ start_pos = pos; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + count) >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; /* * there are lots of better ways to do this, but this code @@ -953,7 +967,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unlock_page(pinned[0]); } } - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { pinned[1] = grab_cache_page(inode->i_mapping, last_index); if (!PageUptodate(pinned[1])) { ret = btrfs_readpage(NULL, pinned[1]); @@ -964,10 +978,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } } - while (count > 0) { + while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(count, nrptrs * - (size_t)PAGE_CACHE_SIZE - + size_t write_bytes = min(iov_iter_count(&i), + nrptrs * (size_t)PAGE_CACHE_SIZE - offset); size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -988,7 +1002,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } ret = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, buf); + write_bytes, pages, &i); if (ret == 0) { dirty_and_release_pages(NULL, root, file, pages, num_pages, pos, write_bytes); @@ -1012,8 +1026,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, btrfs_throttle(root); } - buf += write_bytes; - count -= write_bytes; pos += write_bytes; num_written += write_bytes; @@ -1206,7 +1218,7 @@ const struct file_operations btrfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .splice_read = generic_file_splice_read, - .write = btrfs_file_write, + .aio_write = btrfs_file_aio_write, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, -- cgit v1.2.3 From 4845e44ffdb26be9b25610664228e8ecaf949a0d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 25 May 2010 20:56:50 -0400 Subject: Btrfs: rework O_DIRECT enospc handling This changes O_DIRECT write code to mark extents as delalloc while it is processing them. Yan Zheng has reworked the enospc accounting based on tracking delalloc extents and this makes it much easier to track enospc in the O_DIRECT code. There are a few space cases with the O_DIRECT code though, it only sets the EXTENT_DELALLOC bits, instead of doing EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, because we don't want to mess with clearing the dirty and uptodate bits when things go wrong. This is important because there are no pages in the page cache, so any extent state structs that we put in the tree won't get freed by releasepage. We have to clear them ourselves as the DIO ends. With this commit, we reserve space at in btrfs_file_aio_write, and then as each btrfs_direct_IO call progresses it sets EXTENT_DELALLOC on the range. btrfs_get_blocks_direct is responsible for clearing the delalloc at the same time it drops the extent lock. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 9 ++++----- fs/btrfs/extent_io.h | 4 ++++ fs/btrfs/file.c | 14 -------------- fs/btrfs/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++----------- 4 files changed, 49 insertions(+), 30 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 15392af21bfb..a4080c21ec55 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; } -static void free_extent_state(struct extent_state *state) +void free_extent_state(struct extent_state *state) { if (!state) return; @@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state, * [start, end] is inclusive This takes the tree lock. */ -static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, - gfp_t mask) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 86c7b341d070..5691c7b590da 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -178,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, unsigned long bits); +void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -187,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 233aea2e5ef2..54556cae4497 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -909,13 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } if (num_written < 0) { - if (num_written != -EIOCBQUEUED) { - /* - * aio land will take care of releasing the - * delalloc - */ - btrfs_delalloc_release_space(inode, count); - } ret = num_written; num_written = 0; goto out; @@ -924,13 +917,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, pos = *ppos; goto out; } - - /* - * the buffered IO will reserve bytes for the rest of the - * range, don't double count them here - */ - btrfs_delalloc_release_space(inode, count - num_written); - /* * We are going to do buffered for the rest of the range, so we * need to make sure to invalidate the buffered pages when we're diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13a4aa222861..00aefbdcc2df 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5327,8 +5327,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, return PTR_ERR(em); len = min(len, em->block_len); } - unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, - GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); map: bh_result->b_blocknr = (em->block_start + (start - em->start)) >> inode->i_blkbits; @@ -5596,14 +5597,18 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; u64 lockstart, lockend; ssize_t ret; + int writing = rw & WRITE; + int write_bits = 0; lockstart = offset; lockend = offset + iov_length(iov, nr_segs) - 1; + while (1) { - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - GFP_NOFS); + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -5613,29 +5618,54 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, lockend - lockstart + 1); if (!ordered) break; - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - GFP_NOFS); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); cond_resched(); } + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, 0); if (ret < 0 && ret != -EIOCBQUEUED) { - unlock_extent(&BTRFS_I(inode)->io_tree, offset, - offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { /* * We're falling back to buffered, unlock the section we didn't * do IO on. */ - unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret, - offset + iov_length(iov, nr_segs) - 1, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); } - +out: + free_extent_state(cached_state); return ret; } -- cgit v1.2.3 From 3f7c579c41a3d20af76fd6ff1f6b949edf105fd1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 26 May 2010 10:59:53 -0400 Subject: Btrfs: move O_DIRECT space reservation to btrfs_direct_IO This moves the delalloc space reservation done for O_DIRECT into btrfs_direct_IO. This way we don't leak reserved space if the generic O_DIRECT write code errors out before it calls into btrfs_direct_IO. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 5 ----- fs/btrfs/inode.c | 9 ++++++++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/file.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 54556cae4497..79437c5eeb1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -888,14 +888,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, BTRFS_I(inode)->sequence++; if (unlikely(file->f_flags & O_DIRECT)) { - ret = btrfs_delalloc_reserve_space(inode, count); - if (ret) - goto out; - num_written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, count, ocount); - /* * the generic O_DIRECT will update in-memory i_size after the * DIOs are done. But our endio handlers that update the on diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 00aefbdcc2df..ca9d5501d340 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5602,9 +5602,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ssize_t ret; int writing = rw & WRITE; int write_bits = 0; + size_t count = iov_length(iov, nr_segs); lockstart = offset; - lockend = offset + iov_length(iov, nr_segs) - 1; + lockend = offset + count - 1; + + if (writing) { + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, -- cgit v1.2.3