diff options
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r-- | fs/btrfs/file.c | 561 |
1 files changed, 382 insertions, 179 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 82d0342763c5..ff1cc0399b9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -40,6 +40,7 @@ #include "tree-log.h" #include "locking.h" #include "volumes.h" +#include "qgroup.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -425,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked */ - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); @@ -452,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, write_bytes -= copied; total_copied += copied; - /* Return to btrfs_file_aio_write to fault page */ + /* Return to btrfs_file_write_iter to fault page */ if (unlikely(copied == 0)) break; @@ -475,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } @@ -591,7 +588,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); modified = !list_empty(&em->list); - remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -622,8 +618,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -661,12 +656,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->orig_block_len = 0; } - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } free_extent_map(split); split = NULL; } next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for us */ @@ -692,7 +695,10 @@ next: int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, - u64 *drop_end, int drop_cache) + u64 *drop_end, int drop_cache, + int replace_extent, + u32 extent_item_size, + int *key_inserted) { struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -710,15 +716,18 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int recow; int ret; int modify_tree = -1; - int update_refs = (root->ref_cows || root == root->fs_info->tree_root); + int update_refs; int found = 0; + int leafs_visited = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size) + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; + update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + root == root->fs_info->tree_root); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -733,6 +742,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, path->slots[0]--; } ret = 0; + leafs_visited++; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { @@ -744,6 +754,7 @@ next_slot: ret = 0; break; } + leafs_visited++; leaf = path->nodes[0]; recow = 1; } @@ -766,12 +777,25 @@ next_slot: btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); } else { WARN_ON(1); extent_end = search_start; } + /* + * Don't skip extent items representing 0 byte lengths. They + * used to be created (bug) if while punching holes we hit + * -ENOSPC condition. So if we find one here, just ensure we + * delete it, otherwise we would insert a new file extent item + * with the same key (offset) as that 0 bytes length file + * extent item in the call to setup_items_for_insert() later + * in this function. + */ + if (extent_end == key.offset && extent_end >= search_start) + goto delete_extent_item; + if (extent_end <= search_start) { path->slots[0]++; goto next_slot; @@ -791,7 +815,10 @@ next_slot: */ if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = start; @@ -824,7 +851,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 0); + start - extent_offset, 1); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -834,7 +861,10 @@ next_slot: * | -------- extent -------- | */ if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; @@ -857,7 +887,10 @@ next_slot: */ if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EOPNOTSUPP; + break; + } btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); @@ -876,6 +909,7 @@ next_slot: * | ------ extent ------ | */ if (start <= key.offset && end >= extent_end) { +delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; @@ -927,14 +961,52 @@ next_slot: } if (!ret && del_nr > 0) { + /* + * Set path->slots[0] to first slot, so that after the delete + * if items are move off from our leaf to its immediate left or + * right neighbor leafs, we end up with a correct and adjusted + * path->slots[0] for our insertion (if replace_extent != 0). + */ + path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); } + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; + } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; + } + + if (!replace_extent || !(*key_inserted)) + btrfs_release_path(path); if (drop_end) *drop_end = found ? min(end, extent_end) : end; - btrfs_release_path(path); return ret; } @@ -949,7 +1021,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache); + drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -1136,7 +1208,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset, 1); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1235,29 +1307,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos, } /* - * this gets pages into the page cache and locks them down, it also properly - * waits for data=ordered extents to finish before allowing the pages to be - * modified. + * this just gets pages into the page cache and locks them down. */ -static noinline int prepare_pages(struct btrfs_root *root, struct file *file, - struct page **pages, size_t num_pages, - loff_t pos, unsigned long first_index, - size_t write_bytes, bool force_uptodate) +static noinline int prepare_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + size_t write_bytes, bool force_uptodate) { - struct extent_state *cached_state = NULL; int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; - struct inode *inode = file_inode(file); gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; - int faili = 0; - u64 start_pos; - u64 last_pos; - - start_pos = pos & ~((u64)root->sectorsize - 1); - last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + int faili; -again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, mask | __GFP_WRITE); @@ -1280,57 +1341,82 @@ again: } wait_on_page_writeback(pages[i]); } - faili = num_pages - 1; - err = 0; + + return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + +} + +/* + * This function locks the extent and properly waits for data=ordered extents + * to finish before allowing the pages to be modified if need. + * + * The return value: + * 1 - the extent is locked + * 0 - the extent is not locked, and everything is OK + * -EAGAIN - need re-prepare the pages + * the other < 0 number - Something wrong happens + */ +static noinline int +lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, + u64 *lockstart, u64 *lockend, + struct extent_state **cached_state) +{ + u64 start_pos; + u64 last_pos; + int i; + int ret = 0; + + start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1); + last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1; + if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, 0, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, - last_pos - 1); + start_pos, last_pos, 0, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && - ordered->file_offset < last_pos) { - btrfs_put_ordered_extent(ordered); + ordered->file_offset <= last_pos) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, - &cached_state, GFP_NOFS); + start_pos, last_pos, + cached_state, GFP_NOFS); for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); page_cache_release(pages[i]); } - err = btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos); - if (err) - goto fail; - goto again; + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, &cached_state, GFP_NOFS); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - start_pos, last_pos - 1, &cached_state, - GFP_NOFS); + 0, 0, cached_state, GFP_NOFS); + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; } + for (i = 0; i < num_pages; i++) { if (clear_page_dirty_for_io(pages[i])) account_page_redirty(pages[i]); set_page_extent_mapped(pages[i]); WARN_ON(!PageLocked(pages[i])); } - return 0; -fail: - while (faili >= 0) { - unlock_page(pages[faili]); - page_cache_release(pages[faili]); - faili--; - } - return err; + return ret; } static noinline int check_can_nocow(struct inode *inode, loff_t pos, @@ -1342,8 +1428,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; + ret = btrfs_start_nocow_write(root); + if (!ret) + return -ENOSPC; + lockstart = round_down(pos, root->sectorsize); - lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; while (1) { lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1361,12 +1451,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; + btrfs_end_nocow_write(root); } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - NULL, GFP_NOFS); - *write_bytes = min_t(size_t, *write_bytes, num_bytes); + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); } unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1381,13 +1469,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; + struct extent_state *cached_state = NULL; u64 release_bytes = 0; + u64 lockstart; + u64 lockend; unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; + bool need_unlock; nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / @@ -1452,22 +1544,37 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!only_release_metadata) btrfs_free_reserved_data_space(inode, reserve_bytes); + else + btrfs_end_nocow_write(root); break; } release_bytes = reserve_bytes; - + need_unlock = false; +again: /* * This is going to setup the pages array with the number of * pages we want, so we don't really need to worry about the * contents of pages from loop to loop */ - ret = prepare_pages(root, file, pages, num_pages, - pos, first_index, write_bytes, + ret = prepare_pages(inode, pages, num_pages, + pos, write_bytes, force_page_uptodate); if (ret) break; + ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, + pos, &lockstart, &lockend, + &cached_state); + if (ret < 0) { + if (ret == -EAGAIN) + goto again; + break; + } else if (ret > 0) { + need_unlock = true; + ret = 0; + } + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, i); @@ -1512,18 +1619,23 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; - if (copied > 0) { + + if (copied > 0) ret = btrfs_dirty_pages(root, inode, pages, dirty_pages, pos, copied, NULL); - if (ret) { - btrfs_drop_pages(pages, num_pages); - break; - } + if (need_unlock) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state, + GFP_NOFS); + if (ret) { + btrfs_drop_pages(pages, num_pages); + break; } release_bytes = 0; - btrfs_drop_pages(pages, num_pages); + if (only_release_metadata) + btrfs_end_nocow_write(root); if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); @@ -1536,6 +1648,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, only_release_metadata = false; } + btrfs_drop_pages(pages, num_pages); + cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); @@ -1549,37 +1663,34 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, kfree(pages); if (release_bytes) { - if (only_release_metadata) + if (only_release_metadata) { + btrfs_end_nocow_write(root); btrfs_delalloc_release_metadata(inode, release_bytes); - else + } else { btrfs_delalloc_release_space(inode, release_bytes); + } } return num_written ? num_written : ret; } static ssize_t __btrfs_direct_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) + struct iov_iter *from, + loff_t pos) { struct file *file = iocb->ki_filp; - struct iov_iter i; ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, - count, ocount); + written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) + if (written < 0 || !iov_iter_count(from)) return written; pos += written; - count -= written; - iov_iter_init(&i, iov, nr_segs, count, written); - written_buffered = __btrfs_buffered_write(file, &i, pos); + written_buffered = __btrfs_buffered_write(file, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1589,7 +1700,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, if (err) goto out; written += written_buffered; - *ppos = pos + written_buffered; + iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); out: @@ -1614,29 +1725,22 @@ static void update_time_for_write(struct inode *inode) inode_inc_iversion(inode); } -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, + struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t *ppos = &iocb->ki_pos; u64 start_pos; + u64 end_pos; ssize_t num_written = 0; ssize_t err = 0; - size_t count, ocount; + size_t count = iov_iter_count(from); bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); + loff_t pos = iocb->ki_pos; mutex_lock(&inode->i_mutex); - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) { - mutex_unlock(&inode->i_mutex); - goto out; - } - count = ocount; - current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { @@ -1649,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } + iov_iter_truncate(from, count); + err = file_remove_suid(file); if (err) { mutex_unlock(&inode->i_mutex); @@ -1677,7 +1783,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { - err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); + /* Expand hole size to cover write data, preventing empty gap */ + end_pos = round_up(pos + count, root->sectorsize); + err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1688,16 +1796,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (unlikely(file->f_flags & O_DIRECT)) { - num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); + num_written = __btrfs_direct_write(iocb, from, pos); } else { - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, num_written); - - num_written = __btrfs_buffered_write(file, &i, pos); + num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) - *ppos = pos + num_written; + iocb->ki_pos = pos + num_written; } mutex_unlock(&inode->i_mutex); @@ -1722,7 +1825,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); - if (err < 0 && num_written > 0) + if (err < 0) num_written = err; } @@ -1735,6 +1838,8 @@ out: int btrfs_release_file(struct inode *inode, struct file *filp) { + if (filp->private_data) + btrfs_ioctl_trans_end(filp); /* * ordered_data_close is set by settattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to @@ -1742,26 +1847,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp) * application were using truncate to replace a file in place. */ if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We need to block on a committing transaction to keep us from - * throwing a ordered operation on to the list and causing - * something like sync to deadlock trying to flush out this - * inode. - */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); - btrfs_end_transaction(trans, root); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - } - if (filp->private_data) - btrfs_ioctl_trans_end(filp); return 0; } @@ -1781,8 +1868,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; trace_btrfs_sync_file(file, datasync); @@ -1857,14 +1945,28 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (file->private_data) btrfs_ioctl_trans_end(file); + /* + * We use start here because we will need to wait on the IO to complete + * in btrfs_sync_log, which could require joining a transaction (for + * example checking cross references in the nocow path). If we use join + * here we could get into a situation where we're waiting on IO to + * happen that is blocked on a transaction trying to commit. With start + * we inc the extwriter counter, so we wait for all extwriters to exit + * before we start blocking join'ers. This comment is to keep somebody + * from thinking they are super smart and changing this to + * btrfs_join_transaction *cough*Josef*cough*. + */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); mutex_unlock(&inode->i_mutex); goto out; } + trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry); + btrfs_init_log_ctx(&ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -1884,7 +1986,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { - ret = btrfs_sync_log(trans, root); + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { ret = btrfs_end_transaction(trans, root); goto out; @@ -1893,8 +1995,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!full_sync) { ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) + if (ret) { + btrfs_end_transaction(trans, root); goto out; + } } ret = btrfs_commit_transaction(trans, root); } else { @@ -1906,6 +2010,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -1963,11 +2068,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_key key; int ret; + if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + goto out; + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) return ret; @@ -1989,10 +2096,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, goto out; } - if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { u64 num_bytes; - path->slots[0]++; key.offset = offset; btrfs_set_item_key_safe(root, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], @@ -2049,6 +2155,37 @@ out: return 0; } +/* + * Find a hole extent on given inode and change start/len to the end of hole + * extent.(hole/vacuum extent whose em->start <= start && + * em->start + em->len > start) + * When a hole extent is found, return 1 and modify start/len. + */ +static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) +{ + struct extent_map *em; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0); + if (IS_ERR_OR_NULL(em)) { + if (!em) + ret = -ENOMEM; + else + ret = PTR_ERR(em); + return ret; + } + + /* Hole or vacuum extent(only exists in no-hole mode) */ + if (em->block_start == EXTENT_MAP_HOLE) { + ret = 1; + *len = em->start + em->len > *start + *len ? + 0 : *start + *len - em->start - em->len; + *start = em->start + em->len; + } + free_extent_map(em); + return ret; +} + static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2056,22 +2193,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) struct btrfs_path *path; struct btrfs_block_rsv *rsv; struct btrfs_trans_handle *trans; - u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); - u64 lockend = round_down(offset + len, - BTRFS_I(inode)->root->sectorsize) - 1; - u64 cur_offset = lockstart; + u64 lockstart; + u64 lockend; + u64 tail_start; + u64 tail_len; + u64 orig_start = offset; + u64 cur_offset; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); u64 drop_end; int ret = 0; int err = 0; - bool same_page = ((offset >> PAGE_CACHE_SHIFT) == - ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + int rsv_count; + bool same_page; + bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) return ret; mutex_lock(&inode->i_mutex); + ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + /* Already in a large hole */ + ret = 0; + goto out_only_mutex; + } + + lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + lockend = round_down(offset + len, + BTRFS_I(inode)->root->sectorsize) - 1; + same_page = ((offset >> PAGE_CACHE_SHIFT) == + ((offset + len - 1) >> PAGE_CACHE_SHIFT)); + /* * We needn't truncate any page which is beyond the end of the file * because we are sure there is no data there. @@ -2081,14 +2238,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); - mutex_unlock(&inode->i_mutex); - return ret; + goto out_only_mutex; } /* zero back part of the first page */ - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset < ino_size) { ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2096,12 +2252,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } } - /* zero the front end of the last page */ - if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { - ret = btrfs_truncate_page(inode, offset + len, 0, 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - return ret; + /* Check the aligned pages after the first unaligned page, + * if offset != orig_start, which means the first unaligned page + * including serveral following pages are already in holes, + * the extra check can be skipped */ + if (offset == orig_start) { + /* after truncate page, check hole again */ + len = offset + len - lockstart; + offset = lockstart; + ret = find_first_non_hole(inode, &offset, &len); + if (ret < 0) + goto out_only_mutex; + if (ret && !len) { + ret = 0; + goto out_only_mutex; + } + lockstart = offset; + } + + /* Check the tail unaligned part is in a hole */ + tail_start = lockend + 1; + tail_len = offset + len - tail_start; + if (tail_len) { + ret = find_first_non_hole(inode, &tail_start, &tail_len); + if (unlikely(ret < 0)) + goto out_only_mutex; + if (!ret) { + /* zero the front end of the last page */ + if (tail_start + tail_len < ino_size) { + ret = btrfs_truncate_page(inode, + tail_start + tail_len, 0, 1); + if (ret) + goto out_only_mutex; + } } } @@ -2125,11 +2308,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * we need to try again. */ if ((!ordered || - (ordered->file_offset + ordered->len < lockstart || + (ordered->file_offset + ordered->len <= lockstart || ordered->file_offset > lockend)) && - !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_UPTODATE, 0, - cached_state)) { + !btrfs_page_exists_in_range(inode, lockstart, lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -2163,9 +2344,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent + * 1 - adding the hole extent if no_holes isn't set */ - trans = btrfs_start_transaction(root, 3); + rsv_count = no_holes ? 2 : 3; + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; @@ -2176,19 +2358,24 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) BUG_ON(ret); trans->block_rsv = rsv; + cur_offset = lockstart; + len = lockend - cur_offset; while (cur_offset < lockend) { ret = __btrfs_drop_extents(trans, root, inode, path, cur_offset, lockend + 1, - &drop_end, 1); + &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) break; trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - break; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } } cur_offset = drop_end; @@ -2202,7 +2389,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; @@ -2213,6 +2400,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) rsv, min_size); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; + + ret = find_first_non_hole(inode, &cur_offset, &len); + if (unlikely(ret < 0)) + break; + if (ret && !len) { + ret = 0; + break; + } } if (ret) { @@ -2221,10 +2416,17 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - goto out_trans; + /* + * Don't insert file hole extent item if it's for a range beyond eof + * (because it's useless) or if it represents a 0 bytes range (when + * cur_offset == drop_end). + */ + if (cur_offset < ino_size && cur_offset < drop_end) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } } out_trans: @@ -2244,6 +2446,7 @@ out_free: out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); +out_only_mutex: mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; @@ -2505,11 +2708,11 @@ out: const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, - .aio_write = btrfs_file_aio_write, + .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, |