summaryrefslogtreecommitdiff
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c1095
1 files changed, 873 insertions, 222 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9aa01ec2138d..a005fe2c072a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,22 +24,25 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
+#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
+#include <linux/btrfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
-#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"
+#include "volumes.h"
+static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
* when auto defrag is enabled we
* queue up these defrag structs to remember which
@@ -89,7 +92,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
* If an existing record is found the defrag item you
* pass in is freed
*/
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -117,18 +120,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
entry->transid = defrag->transid;
if (defrag->last_offset > entry->last_offset)
entry->last_offset = defrag->last_offset;
- goto exists;
+ return -EEXIST;
}
}
set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
- return;
+ return 0;
+}
-exists:
- kfree(defrag);
- return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+ if (!btrfs_test_opt(root, AUTO_DEFRAG))
+ return 0;
+ if (btrfs_fs_closing(root->fs_info))
+ return 0;
+
+ return 1;
}
/*
@@ -141,11 +150,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct inode_defrag *defrag;
u64 transid;
+ int ret;
- if (!btrfs_test_opt(root, AUTO_DEFRAG))
- return 0;
-
- if (btrfs_fs_closing(root->fs_info))
+ if (!__need_auto_defrag(root))
return 0;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -156,7 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
else
transid = BTRFS_I(inode)->root->last_trans;
- defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+ defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
return -ENOMEM;
@@ -165,20 +172,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->root = root->root_key.objectid;
spin_lock(&root->fs_info->defrag_inodes_lock);
- if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
- __btrfs_add_inode_defrag(inode, defrag);
- else
- kfree(defrag);
+ if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+ /*
+ * If we set IN_DEFRAG flag and evict the inode from memory,
+ * and then re-read this inode, this new inode doesn't have
+ * IN_DEFRAG flag. At the case, we may find the existed defrag.
+ */
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ if (ret)
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
spin_unlock(&root->fs_info->defrag_inodes_lock);
return 0;
}
/*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
*/
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
- u64 root, u64 ino,
- struct rb_node **next)
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+ struct inode_defrag *defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!__need_auto_defrag(root))
+ goto out;
+
+ /*
+ * Here we don't check the IN_DEFRAG flag, because we need merge
+ * them together.
+ */
+ spin_lock(&root->fs_info->defrag_inodes_lock);
+ ret = __btrfs_add_inode_defrag(inode, defrag);
+ spin_unlock(&root->fs_info->defrag_inodes_lock);
+ if (ret)
+ goto out;
+ return;
+out:
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
struct inode_defrag *entry = NULL;
struct inode_defrag tmp;
@@ -189,7 +232,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
tmp.ino = ino;
tmp.root = root;
- p = info->defrag_inodes.rb_node;
+ spin_lock(&fs_info->defrag_inodes_lock);
+ p = fs_info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -200,52 +244,146 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
else if (ret > 0)
p = parent->rb_right;
else
- return entry;
+ goto out;
}
- if (next) {
- while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
- parent = rb_next(parent);
+ if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+ parent = rb_next(parent);
+ if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
- }
- *next = parent;
+ else
+ entry = NULL;
}
- return NULL;
+out:
+ if (entry)
+ rb_erase(parent, &fs_info->defrag_inodes);
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ return entry;
}
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
+ struct rb_node *node;
+
+ spin_lock(&fs_info->defrag_inodes_lock);
+ node = rb_first(&fs_info->defrag_inodes);
+ while (node) {
+ rb_erase(node, &fs_info->defrag_inodes);
+ defrag = rb_entry(node, struct inode_defrag, rb_node);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+ if (need_resched()) {
+ spin_unlock(&fs_info->defrag_inodes_lock);
+ cond_resched();
+ spin_lock(&fs_info->defrag_inodes_lock);
+ }
+
+ node = rb_first(&fs_info->defrag_inodes);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH 1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+ struct inode_defrag *defrag)
+{
struct btrfs_root *inode_root;
struct inode *inode;
- struct rb_node *n;
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
- u64 first_ino = 0;
- u64 root_objectid = 0;
int num_defrag;
- int defrag_batch = 1024;
+ int index;
+ int ret;
+
+ /* get the inode */
+ key.objectid = defrag->root;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(inode_root)) {
+ ret = PTR_ERR(inode_root);
+ goto cleanup;
+ }
+
+ key.objectid = defrag->ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto cleanup;
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+ /* do a chunk of defrag */
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
+ range.start = defrag->last_offset;
+
+ sb_start_write(fs_info->sb);
+ num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ BTRFS_DEFRAG_BATCH);
+ sb_end_write(fs_info->sb);
+ /*
+ * if we filled the whole defrag batch, there
+ * must be more work to do. Queue this defrag
+ * again
+ */
+ if (num_defrag == BTRFS_DEFRAG_BATCH) {
+ defrag->last_offset = range.start;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else if (defrag->last_offset && !defrag->cycled) {
+ /*
+ * we didn't fill our defrag batch, but
+ * we didn't start at zero. Make sure we loop
+ * around to the start of the file.
+ */
+ defrag->last_offset = 0;
+ defrag->cycled = 1;
+ btrfs_requeue_inode_defrag(inode, defrag);
+ } else {
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ }
+
+ iput(inode);
+ return 0;
+cleanup:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+ return ret;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ u64 first_ino = 0;
+ u64 root_objectid = 0;
atomic_inc(&fs_info->defrag_running);
- spin_lock(&fs_info->defrag_inodes_lock);
while(1) {
- n = NULL;
+ /* Pause the auto defragger. */
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &fs_info->fs_state))
+ break;
+
+ if (!__need_auto_defrag(fs_info->tree_root))
+ break;
/* find an inode to defrag */
- defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
- first_ino, &n);
+ defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+ first_ino);
if (!defrag) {
- if (n) {
- defrag = rb_entry(n, struct inode_defrag,
- rb_node);
- } else if (root_objectid || first_ino) {
+ if (root_objectid || first_ino) {
root_objectid = 0;
first_ino = 0;
continue;
@@ -254,70 +392,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
}
}
- /* remove it from the rbtree */
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
- rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
- if (btrfs_fs_closing(fs_info))
- goto next_free;
-
- spin_unlock(&fs_info->defrag_inodes_lock);
-
- /* get the inode */
- key.objectid = defrag->root;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
- key.offset = (u64)-1;
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
- if (IS_ERR(inode_root))
- goto next;
-
- key.objectid = defrag->ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
- key.offset = 0;
-
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
- if (IS_ERR(inode))
- goto next;
-
- /* do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
- range.start = defrag->last_offset;
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
- defrag_batch);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == defrag_batch) {
- defrag->last_offset = range.start;
- __btrfs_add_inode_defrag(inode, defrag);
- /*
- * we don't want to kfree defrag, we added it back to
- * the rbtree
- */
- defrag = NULL;
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- __btrfs_add_inode_defrag(inode, defrag);
- defrag = NULL;
- }
- iput(inode);
-next:
- spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
- kfree(defrag);
+ __btrfs_run_defrag_inode(fs_info, defrag);
}
- spin_unlock(&fs_info->defrag_inodes_lock);
-
atomic_dec(&fs_info->defrag_running);
/*
@@ -391,7 +470,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
@@ -415,9 +494,9 @@ void btrfs_drop_pages(struct page **pages, size_t num_pages)
* doing real data extents, marking pages dirty and delalloc as required.
*/
int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
- struct page **pages, size_t num_pages,
- loff_t pos, size_t write_bytes,
- struct extent_state **cached)
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached)
{
int err = 0;
int i;
@@ -428,8 +507,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
loff_t isize = i_size_read(inode);
start_pos = pos & ~((u64)root->sectorsize - 1);
- num_bytes = (write_bytes + pos - start_pos +
- root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -458,18 +536,20 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
* this drops all the extents in the cache that intersect the range
* [start, end]. Existing extents are split as required.
*/
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
- int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+ int skip_pinned)
{
struct extent_map *em;
struct extent_map *split = NULL;
struct extent_map *split2 = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 len = end - start + 1;
+ u64 gen;
int ret;
int testend = 1;
unsigned long flags;
int compressed = 0;
+ bool modified;
WARN_ON(end < start);
if (end == (u64)-1) {
@@ -477,11 +557,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
testend = 0;
}
while (1) {
+ int no_splits = 0;
+
+ modified = false;
if (!split)
split = alloc_extent_map();
if (!split2)
split2 = alloc_extent_map();
- BUG_ON(!split || !split2); /* -ENOMEM */
+ if (!split || !split2)
+ no_splits = 1;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -490,6 +574,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
break;
}
flags = em->flags;
+ gen = em->generation;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
if (testend && em->start + em->len >= start + len) {
free_extent_map(em);
@@ -505,7 +590,11 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
}
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ modified = !list_empty(&em->list);
remove_extent_mapping(em_tree, em);
+ if (no_splits)
+ goto next;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
em->start < start) {
@@ -518,11 +607,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->block_len = em->block_len;
else
split->block_len = split->len;
-
+ split->ram_bytes = em->ram_bytes;
+ split->orig_block_len = max(split->block_len,
+ em->orig_block_len);
+ split->generation = gen;
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
- ret = add_extent_mapping(em_tree, split);
+ ret = add_extent_mapping(em_tree, split, modified);
BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = split2;
@@ -537,6 +629,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
+ split->generation = gen;
+ split->orig_block_len = max(em->block_len,
+ em->orig_block_len);
+ split->ram_bytes = em->ram_bytes;
if (compressed) {
split->block_len = em->block_len;
@@ -545,14 +641,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
} else {
split->block_len = split->len;
split->block_start = em->block_start + diff;
- split->orig_start = split->start;
+ split->orig_start = em->orig_start;
}
- ret = add_extent_mapping(em_tree, split);
+ ret = add_extent_mapping(em_tree, split, modified);
BUG_ON(ret); /* Logic error */
free_extent_map(split);
split = NULL;
}
+next:
write_unlock(&em_tree->lock);
/* once for us */
@@ -564,7 +661,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
if (split2)
free_extent_map(split2);
- return 0;
}
/*
@@ -576,13 +672,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
* it is either truncated or split. Anything entirely inside the range
* is deleted from the tree.
*/
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
- u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct btrfs_path *path, u64 start, u64 end,
+ u64 *drop_end, int drop_cache)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
- struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key new_key;
u64 ino = btrfs_ino(inode);
@@ -597,14 +693,12 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
int recow;
int ret;
int modify_tree = -1;
+ int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+ int found = 0;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
if (start >= BTRFS_I(inode)->disk_i_size)
modify_tree = 0;
@@ -666,6 +760,7 @@ next_slot:
goto next_slot;
}
+ found = 1;
search_start = max(key.offset, start);
if (recow || !modify_tree) {
modify_tree = -1;
@@ -707,14 +802,13 @@ next_slot:
extent_end - start);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0) {
ret = btrfs_inc_extent_ref(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
start - extent_offset, 0);
BUG_ON(ret); /* -ENOMEM */
- *hint_byte = disk_bytenr;
}
key.offset = start;
}
@@ -727,17 +821,15 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
extent_offset += end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, end - key.offset);
- *hint_byte = disk_bytenr;
- }
break;
}
@@ -753,10 +845,8 @@ next_slot:
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
btrfs_mark_buffer_dirty(leaf);
- if (disk_bytenr > 0) {
+ if (update_refs && disk_bytenr > 0)
inode_sub_bytes(inode, extent_end - start);
- *hint_byte = disk_bytenr;
- }
if (end == extent_end)
break;
@@ -777,12 +867,13 @@ next_slot:
del_nr++;
}
- if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ if (update_refs &&
+ extent_type == BTRFS_FILE_EXTENT_INLINE) {
inode_sub_bytes(inode,
extent_end - key.offset);
extent_end = ALIGN(extent_end,
root->sectorsize);
- } else if (disk_bytenr > 0) {
+ } else if (update_refs && disk_bytenr > 0) {
ret = btrfs_free_extent(trans, root,
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
@@ -791,7 +882,6 @@ next_slot:
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
- *hint_byte = disk_bytenr;
}
if (end == extent_end)
@@ -806,7 +896,7 @@ next_slot:
del_nr);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
- goto out;
+ break;
}
del_nr = 0;
@@ -825,7 +915,24 @@ next_slot:
btrfs_abort_transaction(trans, root, ret);
}
-out:
+ if (drop_end)
+ *drop_end = found ? min(end, extent_end) : end;
+ btrfs_release_path(path);
+ return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode, u64 start,
+ u64 end, int drop_cache)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+ drop_cache);
btrfs_free_path(path);
return ret;
}
@@ -892,8 +999,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int ret;
u64 ino = btrfs_ino(inode);
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -932,15 +1037,19 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - end);
btrfs_set_file_extent_offset(leaf, fi,
end - orig_offset);
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
end - other_start);
btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +1067,16 @@ again:
struct btrfs_file_extent_item);
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(trans, root, path, &new_key);
+ btrfs_set_item_key_safe(root, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi,
+ trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
other_end - start);
btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +1104,14 @@ again:
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
split - key.offset);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - split);
@@ -1056,12 +1171,14 @@ again:
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_mark_buffer_dirty(leaf);
} else {
fi = btrfs_item_ptr(leaf, del_slot - 1,
struct btrfs_file_extent_item);
btrfs_set_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG);
+ btrfs_set_file_extent_generation(leaf, fi, trans->transid);
btrfs_set_file_extent_num_bytes(leaf, fi,
extent_end - key.offset);
btrfs_mark_buffer_dirty(leaf);
@@ -1113,7 +1230,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili = 0;
@@ -1173,8 +1290,8 @@ again:
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
- GFP_NOFS);
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ 0, 0, &cached_state, GFP_NOFS);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
start_pos, last_pos - 1, &cached_state,
GFP_NOFS);
@@ -1196,17 +1313,69 @@ fail:
}
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+ size_t *write_bytes)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_ordered_extent *ordered;
+ u64 lockstart, lockend;
+ u64 num_bytes;
+ int ret;
+
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+
+ while (1) {
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered) {
+ break;
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+ return PTR_ERR(trans);
+ }
+
+ num_bytes = lockend - lockstart + 1;
+ ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
+ NULL);
+ btrfs_end_transaction(trans, root);
+ if (ret <= 0) {
+ ret = 0;
+ } else {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+ NULL, GFP_NOFS);
+ *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ }
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+ return ret;
+}
+
static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct iov_iter *i,
loff_t pos)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
+ u64 release_bytes = 0;
unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
+ bool only_release_metadata = false;
bool force_page_uptodate = false;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1227,6 +1396,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
offset);
size_t num_pages = (write_bytes + offset +
PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
@@ -1241,11 +1411,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
- ret = btrfs_delalloc_reserve_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = btrfs_check_data_free_space(inode, reserve_bytes);
+ if (ret == -ENOSPC &&
+ (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC))) {
+ ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret > 0) {
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = (write_bytes + offset +
+ PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ ret = 0;
+ } else {
+ ret = -ENOSPC;
+ }
+ }
+
if (ret)
break;
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+ if (ret) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode,
+ reserve_bytes);
+ break;
+ }
+
+ release_bytes = reserve_bytes;
+
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
@@ -1254,11 +1454,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
ret = prepare_pages(root, file, pages, num_pages,
pos, first_index, write_bytes,
force_page_uptodate);
- if (ret) {
- btrfs_delalloc_release_space(inode,
- num_pages << PAGE_CACHE_SHIFT);
+ if (ret)
break;
- }
copied = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, i);
@@ -1288,36 +1485,51 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
* managed to copy.
*/
if (num_pages > dirty_pages) {
+ release_bytes = (num_pages - dirty_pages) <<
+ PAGE_CACHE_SHIFT;
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- btrfs_delalloc_release_space(inode,
- (num_pages - dirty_pages) <<
- PAGE_CACHE_SHIFT);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode,
+ release_bytes);
+ else
+ btrfs_delalloc_release_space(inode,
+ release_bytes);
}
+ release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
if (copied > 0) {
ret = btrfs_dirty_pages(root, inode, pages,
dirty_pages, pos, copied,
NULL);
if (ret) {
- btrfs_delalloc_release_space(inode,
- dirty_pages << PAGE_CACHE_SHIFT);
btrfs_drop_pages(pages, num_pages);
break;
}
}
+ release_bytes = 0;
btrfs_drop_pages(pages, num_pages);
+ if (only_release_metadata && copied > 0) {
+ u64 lockstart = round_down(pos, root->sectorsize);
+ u64 lockend = lockstart +
+ (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+ set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_NORESERVE, NULL,
+ NULL, GFP_NOFS);
+ only_release_metadata = false;
+ }
+
cond_resched();
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_pages);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root, 1);
+ btrfs_btree_balance_dirty(root);
pos += copied;
num_written += copied;
@@ -1325,6 +1537,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
kfree(pages);
+ if (release_bytes) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, release_bytes);
+ else
+ btrfs_delalloc_release_space(inode, release_bytes);
+ }
+
return num_written ? num_written : ret;
}
@@ -1366,20 +1585,37 @@ out:
return written ? written : err;
}
+static void update_time_for_write(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOCMTIME(inode))
+ return;
+
+ now = current_fs_time(inode->i_sb);
+ if (!timespec_equal(&inode->i_mtime, &now))
+ inode->i_mtime = now;
+
+ if (!timespec_equal(&inode->i_ctime, &now))
+ inode->i_ctime = now;
+
+ if (IS_I_VERSION(inode))
+ inode_inc_iversion(inode);
+}
+
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
loff_t *ppos = &iocb->ki_pos;
u64 start_pos;
ssize_t num_written = 0;
ssize_t err = 0;
size_t count, ocount;
-
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+ bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
mutex_lock(&inode->i_mutex);
@@ -1414,17 +1650,19 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
* although we have opened a file as writable, we have
* to stop this write operation to ensure FS consistency.
*/
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
mutex_unlock(&inode->i_mutex);
err = -EROFS;
goto out;
}
- err = file_update_time(file);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
+ /*
+ * We reserve space for updating the inode when we reserve space for the
+ * extent we are going to write, so we will enospc out there. We don't
+ * need to start yet another transaction to update the inode as we will
+ * update the inode when we finish writing whatever data we write.
+ */
+ update_time_for_write(inode);
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
@@ -1435,6 +1673,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
}
}
+ if (sync)
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+
if (unlikely(file->f_flags & O_DIRECT)) {
num_written = __btrfs_direct_write(iocb, iov, nr_segs,
pos, ppos, count, ocount);
@@ -1461,13 +1702,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
* this will either be one more than the running transaction
* or the generation used for the next transaction if there isn't
* one running right now.
+ *
+ * We also have to set last_sub_trans to the current log transid,
+ * otherwise subsequent syncs to a file that's been synced in this
+ * transaction will appear to have already occured.
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0 || num_written == -EIOCBQUEUED) {
err = generic_write_sync(file, pos, num_written);
if (err < 0 && num_written > 0)
num_written = err;
}
+
+ if (sync)
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
out:
current->backing_dev_info = NULL;
return num_written ? num_written : err;
@@ -1483,7 +1732,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
*/
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags)) {
- btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We need to block on a committing transaction to keep us from
+ * throwing a ordered operation on to the list and causing
+ * something like sync to deadlock trying to flush out this
+ * inode.
+ */
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
+ btrfs_end_transaction(trans, root);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
}
@@ -1510,19 +1772,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
struct btrfs_trans_handle *trans;
+ bool full_sync = 0;
trace_btrfs_sync_file(file, datasync);
+ /*
+ * We write the dirty pages in the range and wait until they complete
+ * out of the ->i_mutex. If so, we can flush the dirty pages by
+ * multi-task, and make the performance up. See
+ * btrfs_wait_ordered_range for an explanation of the ASYNC check.
+ */
+ atomic_inc(&BTRFS_I(inode)->sync_writers);
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ atomic_dec(&BTRFS_I(inode)->sync_writers);
+ if (ret)
+ return ret;
+
mutex_lock(&inode->i_mutex);
/*
- * we wait first, since the writeback may change the inode, also wait
- * ordered range does a filemape_write_and_wait_range which is why we
- * don't do it above like other file systems.
+ * We flush the dirty pages again to avoid some dirty pages in the
+ * range being left.
*/
- root->log_batch++;
- btrfs_wait_ordered_range(inode, start, end);
- root->log_batch++;
+ atomic_inc(&root->log_batch);
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ if (full_sync)
+ btrfs_wait_ordered_range(inode, start, end - start + 1);
+ atomic_inc(&root->log_batch);
/*
* check the transaction that last modified this inode
@@ -1543,6 +1823,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
BTRFS_I(inode)->last_trans <=
root->fs_info->last_trans_committed) {
BTRFS_I(inode)->last_trans = 0;
+
+ /*
+ * We'v had everything committed since the last time we were
+ * modified so clear this flag in case it was set for whatever
+ * reason, it's no longer relevant.
+ */
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
mutex_unlock(&inode->i_mutex);
goto out;
}
@@ -1580,13 +1868,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret != BTRFS_NO_LOG_SYNC) {
if (ret > 0) {
+ /*
+ * If we didn't already wait for ordered extents we need
+ * to do that now.
+ */
+ if (!full_sync)
+ btrfs_wait_ordered_range(inode, start,
+ end - start + 1);
ret = btrfs_commit_transaction(trans, root);
} else {
ret = btrfs_sync_log(trans, root);
- if (ret == 0)
+ if (ret == 0) {
ret = btrfs_end_transaction(trans, root);
- else
+ } else {
+ if (!full_sync)
+ btrfs_wait_ordered_range(inode, start,
+ end -
+ start + 1);
ret = btrfs_commit_transaction(trans, root);
+ }
}
} else {
ret = btrfs_end_transaction(trans, root);
@@ -1598,6 +1898,7 @@ out:
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = btrfs_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -1609,46 +1910,369 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
file_accessed(filp);
vma->vm_ops = &btrfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
+{
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+
+ if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+ return 0;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+ return 0;
+
+ if (btrfs_file_extent_disk_bytenr(leaf, fi))
+ return 0;
+
+ if (key.offset == end)
+ return 1;
+ if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+ return 1;
+ return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct btrfs_path *path, u64 offset, u64 end)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *fi;
+ struct extent_map *hole_em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0)
+ return ret;
+ BUG_ON(!ret);
+
+ leaf = path->nodes[0];
+ if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]--;
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+ end - offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+
+ if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+ u64 num_bytes;
+
+ path->slots[0]++;
+ key.offset = offset;
+ btrfs_set_item_key_safe(root, path, &key);
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+ offset;
+ btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+ btrfs_set_file_extent_offset(leaf, fi, 0);
+ btrfs_mark_buffer_dirty(leaf);
+ goto out;
+ }
+ btrfs_release_path(path);
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, end - offset, 0, end - offset,
+ 0, 0, 0);
+ if (ret)
+ return ret;
+
+out:
+ btrfs_release_path(path);
+
+ hole_em = alloc_extent_map();
+ if (!hole_em) {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ hole_em->start = offset;
+ hole_em->len = end - offset;
+ hole_em->ram_bytes = hole_em->len;
+ hole_em->orig_start = offset;
+
+ hole_em->block_start = EXTENT_MAP_HOLE;
+ hole_em->block_len = 0;
+ hole_em->orig_block_len = 0;
+ hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+ hole_em->compress_type = BTRFS_COMPRESS_NONE;
+ hole_em->generation = trans->transid;
+
+ do {
+ btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, hole_em, 1);
+ write_unlock(&em_tree->lock);
+ } while (ret == -EEXIST);
+ free_extent_map(hole_em);
+ if (ret)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
+ return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_block_rsv *rsv;
+ struct btrfs_trans_handle *trans;
+ u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+ u64 lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ u64 cur_offset = lockstart;
+ u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 drop_end;
+ int ret = 0;
+ int err = 0;
+ bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
+ btrfs_wait_ordered_range(inode, offset, len);
+
+ mutex_lock(&inode->i_mutex);
+ /*
+ * We needn't truncate any page which is beyond the end of the file
+ * because we are sure there is no data there.
+ */
+ /*
+ * Only do this if we are in the same page and we aren't doing the
+ * entire page.
+ */
+ if (same_page && len < PAGE_CACHE_SIZE) {
+ if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ ret = btrfs_truncate_page(inode, offset, len, 0);
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+
+ /* zero back part of the first page */
+ if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ ret = btrfs_truncate_page(inode, offset, 0, 0);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ /* zero the front end of the last page */
+ if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
+ }
+
+ if (lockend < lockstart) {
+ mutex_unlock(&inode->i_mutex);
+ return 0;
+ }
+
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
+
+ truncate_pagecache_range(inode, lockstart, lockend);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state);
+ ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+ /*
+ * We need to make sure we have no ordered extents in this range
+ * and nobody raced in and read a page in this range, if we did
+ * we need to try again.
+ */
+ if ((!ordered ||
+ (ordered->file_offset + ordered->len < lockstart ||
+ ordered->file_offset > lockend)) &&
+ !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_UPTODATE, 0,
+ cached_state)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ break;
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state, GFP_NOFS);
+ btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+ if (!rsv) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+ rsv->failfast = 1;
+
+ /*
+ * 1 - update the inode
+ * 1 - removing the extents in the range
+ * 1 - adding the hole extent
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out_free;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+ min_size);
+ BUG_ON(ret);
+ trans->block_rsv = rsv;
+
+ while (cur_offset < lockend) {
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ cur_offset, lockend + 1,
+ &drop_end, 1);
+ if (ret != -ENOSPC)
+ break;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ cur_offset = drop_end;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ break;
+ }
+
+ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+ rsv, min_size);
+ BUG_ON(ret); /* shouldn't happen */
+ trans->block_rsv = rsv;
+ }
+
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
+
+out_trans:
+ if (!trans)
+ goto out_free;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ ret = btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root);
+out_free:
+ btrfs_free_path(path);
+ btrfs_free_block_rsv(root, rsv);
+out:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ mutex_unlock(&inode->i_mutex);
+ if (ret && !err)
+ err = ret;
+ return err;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
- u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
+ int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
- alloc_start = offset & ~mask;
- alloc_end = (offset + len + mask) & ~mask;
+ alloc_start = round_down(offset, blocksize);
+ alloc_end = round_up(offset + len, blocksize);
- /* We only support the FALLOC_FL_KEEP_SIZE mode */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+ /* Make sure we aren't being give some crap mode */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return btrfs_punch_hole(inode, offset, len);
+
/*
* Make sure we have enough space before we do the
* allocation.
*/
- ret = btrfs_check_data_free_space(inode, len);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
return ret;
-
- /*
- * wait for ordered IO before we have any locks. We'll loop again
- * below with the locks held.
- */
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+ if (root->fs_info->quota_enabled) {
+ ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
+ if (ret)
+ goto out_reserve_fail;
+ }
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
@@ -1660,8 +2284,23 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start);
if (ret)
goto out;
+ } else {
+ /*
+ * If we are fallocating from the end of the file onward we
+ * need to zero out the end of the page if i_size lands in the
+ * middle of a page.
+ */
+ ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ if (ret)
+ goto out;
}
+ /*
+ * wait for ordered IO before we have any locks. We'll loop again
+ * below with the locks held.
+ */
+ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -1708,7 +2347,7 @@ static long btrfs_fallocate(struct file *file, int mode,
}
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
- last_byte = (last_byte + mask) & ~mask;
+ last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
@@ -1746,12 +2385,15 @@ static long btrfs_fallocate(struct file *file, int mode,
&cached_state, GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
+ if (root->fs_info->quota_enabled)
+ btrfs_qgroup_free(root, alloc_end - alloc_start);
+out_reserve_fail:
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
return ret;
}
-static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
+static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map *em;
@@ -1768,6 +2410,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
+ lockend--;
len = lockend - lockstart + 1;
len = max_t(u64, len, root->sectorsize);
@@ -1785,7 +2428,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
* before the position we want in case there is outstanding delalloc
* going on here.
*/
- if (origin == SEEK_HOLE && start != 0) {
+ if (whence == SEEK_HOLE && start != 0) {
if (start <= root->sectorsize)
em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
root->sectorsize, 0);
@@ -1819,13 +2462,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
}
}
- if (origin == SEEK_HOLE) {
+ if (whence == SEEK_HOLE) {
*offset = start;
free_extent_map(em);
break;
}
} else {
- if (origin == SEEK_DATA) {
+ if (whence == SEEK_DATA) {
if (em->block_start == EXTENT_MAP_DELALLOC) {
if (start >= inode->i_size) {
free_extent_map(em);
@@ -1834,9 +2477,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
}
}
- *offset = start;
- free_extent_map(em);
- break;
+ if (!test_bit(EXTENT_FLAG_PREALLOC,
+ &em->flags)) {
+ *offset = start;
+ free_extent_map(em);
+ break;
+ }
}
}
@@ -1862,16 +2508,16 @@ out:
return ret;
}
-static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int ret;
mutex_lock(&inode->i_mutex);
- switch (origin) {
+ switch (whence) {
case SEEK_END:
case SEEK_CUR:
- offset = generic_file_llseek(file, offset, origin);
+ offset = generic_file_llseek(file, offset, whence);
goto out;
case SEEK_DATA:
case SEEK_HOLE:
@@ -1880,27 +2526,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
return -ENXIO;
}
- ret = find_desired_extent(inode, &offset, origin);
+ ret = find_desired_extent(inode, &offset, whence);
if (ret) {
mutex_unlock(&inode->i_mutex);
return ret;
}
}
- if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
- offset = -EINVAL;
- goto out;
- }
- if (offset > inode->i_sb->s_maxbytes) {
- offset = -EINVAL;
- goto out;
- }
-
- /* Special lock needed here? */
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
+ offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
mutex_unlock(&inode->i_mutex);
return offset;
@@ -1923,3 +2556,21 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_ioctl,
#endif
};
+
+void btrfs_auto_defrag_exit(void)
+{
+ if (btrfs_inode_defrag_cachep)
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+ btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+ sizeof(struct inode_defrag), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
+ if (!btrfs_inode_defrag_cachep)
+ return -ENOMEM;
+
+ return 0;
+}