summaryrefslogtreecommitdiff
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2013-05-01 08:47:44 -0700
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2013-05-01 08:47:44 -0700
commitbf61c8840efe60fd8f91446860b63338fb424158 (patch)
tree7a71832407a4f0d6346db773343f4c3ae2257b19 /fs/btrfs/extent-tree.c
parent5846115b30f3a881e542c8bfde59a699c1c13740 (diff)
parent0c6a61657da78098472fd0eb71cc01f2387fa1bb (diff)
Merge branch 'next' into for-linus
Prepare first set of updates for 3.10 merge window.
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--fs/btrfs/extent-tree.c762
1 files changed, 500 insertions, 262 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..9ac2eca681eb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,8 +31,10 @@
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
+#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "math.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -71,8 +73,7 @@ enum {
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -102,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -161,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
rb_link_node(&block_group->cache_node, parent, p);
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
+
+ if (info->first_logical_byte > block_group->key.objectid)
+ info->first_logical_byte = block_group->key.objectid;
+
spin_unlock(&info->block_group_cache_lock);
return 0;
@@ -202,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
break;
}
}
- if (ret)
+ if (ret) {
btrfs_get_block_group(ret);
+ if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+ info->first_logical_byte = ret->key.objectid;
+ }
spin_unlock(&info->block_group_cache_lock);
return ret;
@@ -467,8 +477,6 @@ out:
}
static int cache_block_group(struct btrfs_block_group_cache *cache,
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
int load_cache_only)
{
DEFINE_WAIT(wait);
@@ -526,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
cache->cached = BTRFS_CACHE_FAST;
spin_unlock(&cache->lock);
- /*
- * We can't do the read from on-disk cache during a commit since we need
- * to have the normal tree locking. Also if we are currently trying to
- * allocate blocks for the tree root we can't do the fast caching since
- * we likely hold important locks.
- */
if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
ret = load_free_space_cache(fs_info, cache);
@@ -649,24 +651,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
rcu_read_unlock();
}
-static u64 div_factor(u64 num, int factor)
-{
- if (factor == 10)
- return num;
- num *= factor;
- do_div(num, 10);
- return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
- if (factor == 100)
- return num;
- num *= factor;
- do_div(num, 100);
- return num;
-}
-
u64 btrfs_find_block_group(struct btrfs_root *root,
u64 search_start, u64 search_hint, int owner)
{
@@ -1483,8 +1467,11 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
if (ret && !insert) {
err = -ENOENT;
goto out;
+ } else if (ret) {
+ err = -EIO;
+ WARN_ON(1);
+ goto out;
}
- BUG_ON(ret); /* Corruption */
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1835,7 +1822,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+ ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
bytenr, &num_bytes, &bbio, 0);
/* Error condition is -ENOMEM */
if (!ret) {
@@ -1869,6 +1856,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
*actual_bytes = discarded_bytes;
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
return ret;
}
@@ -2160,7 +2149,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
- mutex_unlock(&head->mutex);
return ret;
}
@@ -2275,7 +2263,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
* process of being added. Don't run this ref yet.
*/
list_del_init(&locked_ref->cluster);
- mutex_unlock(&locked_ref->mutex);
+ btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
@@ -2302,7 +2290,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref = &locked_ref->node;
if (extent_op && must_insert_reserved) {
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
extent_op = NULL;
}
@@ -2311,25 +2299,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+ printk(KERN_DEBUG
+ "btrfs: run_delayed_extent_op "
+ "returned %d\n", ret);
spin_lock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(locked_ref);
return ret;
}
goto next;
}
-
- list_del_init(&locked_ref->cluster);
- locked_ref = NULL;
}
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
- if (locked_ref) {
+ if (!btrfs_delayed_ref_is_head(ref)) {
/*
* when we play the delayed ref, also correct the
* ref_mod on head
@@ -2351,16 +2339,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
- btrfs_put_delayed_ref(ref);
- kfree(extent_op);
- count++;
-
+ btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+ btrfs_delayed_ref_unlock(locked_ref);
+ btrfs_put_delayed_ref(ref);
+ printk(KERN_DEBUG
+ "btrfs: run_one_delayed_ref returned %d\n", ret);
spin_lock(&delayed_refs->lock);
return ret;
}
+ /*
+ * If this node is a head, that means all the refs in this head
+ * have been dealt with, and we will pick the next head to deal
+ * with, so we must unlock the head and drop it from the cluster
+ * list before we release it.
+ */
+ if (btrfs_delayed_ref_is_head(ref)) {
+ list_del_init(&locked_ref->cluster);
+ btrfs_delayed_ref_unlock(locked_ref);
+ locked_ref = NULL;
+ }
+ btrfs_put_delayed_ref(ref);
+ count++;
next:
cond_resched();
spin_lock(&delayed_refs->lock);
@@ -2445,6 +2446,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
return ret;
}
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+ int count)
+{
+ int val = atomic_read(&delayed_refs->ref_seq);
+
+ if (val < seq || val >= seq + count)
+ return 1;
+ return 0;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2479,6 +2490,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+
+ if (!run_all && !run_most) {
+ int old;
+ int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+ old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ if (old) {
+ DEFINE_WAIT(__wait);
+ if (delayed_refs->num_entries < 16348)
+ return 0;
+
+ prepare_to_wait(&delayed_refs->wait, &__wait,
+ TASK_UNINTERRUPTIBLE);
+
+ old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+ if (old) {
+ schedule();
+ finish_wait(&delayed_refs->wait, &__wait);
+
+ if (!refs_newer(delayed_refs, seq, 256))
+ goto progress;
+ else
+ return 0;
+ } else {
+ finish_wait(&delayed_refs->wait, &__wait);
+ goto again;
+ }
+ }
+
+ } else {
+ atomic_inc(&delayed_refs->procs_running_refs);
+ }
+
again:
loops = 0;
spin_lock(&delayed_refs->lock);
@@ -2487,10 +2536,6 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
- if (count == 0) {
- count = delayed_refs->num_entries * 2;
- run_most = 1;
- }
while (1) {
if (!(run_all || run_most) &&
delayed_refs->num_heads_ready < 64)
@@ -2510,11 +2555,15 @@ again:
ret = run_clustered_refs(trans, root, &cluster);
if (ret < 0) {
+ btrfs_release_ref_cluster(&cluster);
spin_unlock(&delayed_refs->lock);
btrfs_abort_transaction(trans, root, ret);
+ atomic_dec(&delayed_refs->procs_running_refs);
return ret;
}
+ atomic_add(ret, &delayed_refs->ref_seq);
+
count -= min_t(unsigned long, ret, count);
if (count == 0)
@@ -2583,6 +2632,11 @@ again:
goto again;
}
out:
+ atomic_dec(&delayed_refs->procs_running_refs);
+ smp_mb();
+ if (waitqueue_active(&delayed_refs->wait))
+ wake_up(&delayed_refs->wait);
+
spin_unlock(&delayed_refs->lock);
assert_qgroups_uptodate(trans);
return 0;
@@ -2596,7 +2650,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_delayed_extent_op *extent_op;
int ret;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ extent_op = btrfs_alloc_delayed_extent_op();
if (!extent_op)
return -ENOMEM;
@@ -2608,7 +2662,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
num_bytes, extent_op);
if (ret)
- kfree(extent_op);
+ btrfs_free_delayed_extent_op(extent_op);
return ret;
}
@@ -3233,12 +3287,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
+ write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits |= extra_flags;
if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits |= extra_flags;
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits |= extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
}
/*
@@ -3286,6 +3342,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
u64 target;
+ u64 tmp;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3302,40 +3359,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
}
spin_unlock(&root->fs_info->balance_lock);
+ /* First, mask out the RAID levels which aren't possible */
if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+ flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5);
+ if (num_devices < 3)
+ flags &= ~BTRFS_BLOCK_GROUP_RAID6;
if (num_devices < 4)
flags &= ~BTRFS_BLOCK_GROUP_RAID10;
- if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
- (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10))) {
- flags &= ~BTRFS_BLOCK_GROUP_DUP;
- }
+ tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+ flags &= ~tmp;
- if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
- (flags & BTRFS_BLOCK_GROUP_RAID10)) {
- flags &= ~BTRFS_BLOCK_GROUP_RAID1;
- }
+ if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+ tmp = BTRFS_BLOCK_GROUP_RAID6;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+ tmp = BTRFS_BLOCK_GROUP_RAID5;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+ tmp = BTRFS_BLOCK_GROUP_RAID10;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+ tmp = BTRFS_BLOCK_GROUP_RAID1;
+ else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+ tmp = BTRFS_BLOCK_GROUP_RAID0;
- if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
- ((flags & BTRFS_BLOCK_GROUP_RAID1) |
- (flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP))) {
- flags &= ~BTRFS_BLOCK_GROUP_RAID0;
- }
-
- return extended_to_chunk(flags);
+ return extended_to_chunk(flags | tmp);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits;
- else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits;
+ unsigned seq;
+
+ do {
+ seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+ } while (read_seqretry(&root->fs_info->profiles_lock, seq));
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3343,6 +3408,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
u64 flags;
+ u64 ret;
if (data)
flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3351,7 +3417,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
else
flags = BTRFS_BLOCK_GROUP_METADATA;
- return get_alloc_profile(root, flags);
+ ret = get_alloc_profile(root, flags);
+ return ret;
}
/*
@@ -3367,7 +3434,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
int ret = 0, committed = 0, alloc_chunk = 1;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
if (root == root->fs_info->tree_root ||
BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3462,7 +3529,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
struct btrfs_space_info *data_sinfo;
/* make sure bytes are sectorsize aligned */
- bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+ bytes = ALIGN(bytes, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
@@ -3526,8 +3593,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
{
u64 num_dev;
- if (type & BTRFS_BLOCK_GROUP_RAID10 ||
- type & BTRFS_BLOCK_GROUP_RAID0)
+ if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6))
num_dev = root->fs_info->fs_devices->rw_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_dev = 2;
@@ -3574,6 +3643,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
+ /* Don't re-enter if we're already allocating a chunk */
+ if (trans->allocating_chunk)
+ return -ENOSPC;
+
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
@@ -3616,6 +3689,8 @@ again:
goto again;
}
+ trans->allocating_chunk = true;
+
/*
* If we have mixed data/metadata chunks we want to make sure we keep
* allocating mixed chunks instead of individual chunks.
@@ -3642,34 +3717,53 @@ again:
check_system_chunk(trans, extent_root, flags);
ret = btrfs_alloc_chunk(trans, extent_root, flags);
- if (ret < 0 && ret != -ENOSPC)
- goto out;
+ trans->allocating_chunk = false;
spin_lock(&space_info->lock);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out;
if (ret)
space_info->full = 1;
else
ret = 1;
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
-out:
mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
- int flush)
+ enum btrfs_reserve_flush_enum flush)
{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
u64 profile = btrfs_get_alloc_profile(root, 0);
+ u64 rsv_size = 0;
u64 avail;
u64 used;
+ u64 to_add;
used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ spin_lock(&global_rsv->lock);
+ rsv_size = global_rsv->size;
+ spin_unlock(&global_rsv->lock);
+
+ /*
+ * We only want to allow over committing if we have lots of actual space
+ * free, but if we don't have enough space to handle the global reserve
+ * space then we could end up having a real enospc problem when trying
+ * to allocate a chunk or some other such important allocation.
+ */
+ rsv_size <<= 1;
+ if (used + rsv_size >= space_info->total_bytes)
+ return 0;
+
+ used += space_info->bytes_may_use;
spin_lock(&root->fs_info->free_chunk_lock);
avail = root->fs_info->free_chunk_space;
@@ -3677,28 +3771,60 @@ static int can_overcommit(struct btrfs_root *root,
/*
* If we have dup, raid1 or raid10 then only half of the free
- * space is actually useable.
+ * space is actually useable. For raid56, the space info used
+ * doesn't include the parity drive, so we don't have to
+ * change the math
*/
if (profile & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
avail >>= 1;
+ to_add = space_info->total_bytes;
+
/*
- * If we aren't flushing don't let us overcommit too much, say
- * 1/8th of the space. If we can flush, let it overcommit up to
- * 1/2 of the space.
+ * If we aren't flushing all things, let us overcommit up to
+ * 1/2th of the space. If we can flush, don't let us overcommit
+ * too much, let it overcommit up to 1/8 of the space.
*/
- if (flush)
- avail >>= 3;
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ to_add >>= 3;
else
- avail >>= 1;
+ to_add >>= 1;
- if (used + bytes < space_info->total_bytes + avail)
+ /*
+ * Limit the overcommit to the amount of free space we could possibly
+ * allocate for chunks.
+ */
+ to_add = min(avail, to_add);
+
+ if (used + bytes < space_info->total_bytes + to_add)
return 1;
return 0;
}
+void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+ unsigned long nr_pages)
+{
+ struct super_block *sb = root->fs_info->sb;
+ int started;
+
+ /* If we can not start writeback, just sync all the delalloc file. */
+ started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
+ WB_REASON_FS_FREE_SPACE);
+ if (!started) {
+ /*
+ * We needn't worry the filesystem going from r/w to r/o though
+ * we don't acquire ->s_umount mutex, because the filesystem
+ * should guarantee the delalloc inodes list be empty after
+ * the filesystem is readonly(all dirty pages are written to
+ * the disk).
+ */
+ btrfs_start_delalloc_inodes(root, 0);
+ btrfs_wait_ordered_extents(root, 0);
+ }
+}
+
/*
* shrink metadata reservation for delalloc
*/
@@ -3713,13 +3839,15 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
long time_left;
unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
int loops = 0;
+ enum btrfs_reserve_flush_enum flush;
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
smp_mb();
- delalloc_bytes = root->fs_info->delalloc_bytes;
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
if (delalloc_bytes == 0) {
if (trans)
return;
@@ -3730,9 +3858,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
- WB_REASON_FS_FREE_SPACE);
-
+ btrfs_writeback_inodes_sb_nr(root, nr_pages);
/*
* We need to wait for the async pages to actually start before
* we do anything.
@@ -3740,8 +3866,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
wait_event(root->fs_info->async_submit_wait,
!atomic_read(&root->fs_info->async_delalloc_pages));
+ if (!trans)
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ else
+ flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
- if (can_overcommit(root, space_info, orig, !trans)) {
+ if (can_overcommit(root, space_info, orig, flush)) {
spin_unlock(&space_info->lock);
break;
}
@@ -3756,7 +3886,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
break;
}
smp_mb();
- delalloc_bytes = root->fs_info->delalloc_bytes;
+ delalloc_bytes = percpu_counter_sum_positive(
+ &root->fs_info->delalloc_bytes);
}
}
@@ -3888,7 +4019,7 @@ static int flush_space(struct btrfs_root *root,
* @root - the root we're allocating for
* @block_rsv - the block_rsv we're allocating for
* @orig_bytes - the number of bytes we want
- * @flush - wether or not we can flush to make our reservation
+ * @flush - whether or not we can flush to make our reservation
*
* This will reserve orgi_bytes number of bytes from the space info associated
* with the block_rsv. If there is not enough space it will make an attempt to
@@ -3899,7 +4030,8 @@ static int flush_space(struct btrfs_root *root,
*/
static int reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes, int flush)
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
u64 used;
@@ -3912,10 +4044,11 @@ again:
ret = 0;
spin_lock(&space_info->lock);
/*
- * We only want to wait if somebody other than us is flushing and we are
- * actually alloed to flush.
+ * We only want to wait if somebody other than us is flushing and we
+ * are actually allowed to flush all things.
*/
- while (flush && !flushing && space_info->flush) {
+ while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+ space_info->flush) {
spin_unlock(&space_info->lock);
/*
* If we have a trans handle we can't wait because the flusher
@@ -3981,26 +4114,52 @@ again:
* Couldn't make our reservation, save our place so while we're trying
* to reclaim space we can actually use it instead of somebody else
* stealing it from us.
+ *
+ * We make the other tasks wait for the flush only when we can flush
+ * all things.
*/
- if (ret && flush) {
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
}
spin_unlock(&space_info->lock);
- if (!ret || !flush)
+ if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
goto out;
ret = flush_space(root, space_info, num_bytes, orig_bytes,
flush_state);
flush_state++;
+
+ /*
+ * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
+ * would happen. So skip delalloc flush.
+ */
+ if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT))
+ flush_state = ALLOC_CHUNK;
+
if (!ret)
goto again;
- else if (flush_state <= COMMIT_TRANS)
+ else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+ flush_state < COMMIT_TRANS)
+ goto again;
+ else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ flush_state <= COMMIT_TRANS)
goto again;
out:
+ if (ret == -ENOSPC &&
+ unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+ struct btrfs_block_rsv *global_rsv =
+ &root->fs_info->global_block_rsv;
+
+ if (block_rsv != global_rsv &&
+ !block_rsv_use_bytes(global_rsv, orig_bytes))
+ ret = 0;
+ }
if (flushing) {
spin_lock(&space_info->lock);
space_info->flush = 0;
@@ -4148,9 +4307,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
kfree(rsv);
}
-static inline int __block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
int ret;
@@ -4166,20 +4325,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_add(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 num_bytes)
-{
- return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
int btrfs_block_rsv_check(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, int min_factor)
{
@@ -4198,9 +4343,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
return ret;
}
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+ enum btrfs_reserve_flush_enum flush)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@@ -4228,20 +4373,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_refill(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 min_reserved)
-{
- return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv,
u64 num_bytes)
@@ -4415,19 +4546,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending)
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need do reservation
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different with the
+ * common file/directory operations, they change two fs/file trees
+ * and root tree, the number of items that the qgroup reserves is
+ * different with the free space reservation. So we can not use
+ * the space reseravtion mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ int items,
+ u64 *qgroup_reserved)
{
- struct btrfs_root *root = pending->root;
- struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
- struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
- /*
- * two for root back/forward refs, two for directory entries,
- * one for root of the snapshot and one for parent inode.
- */
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
- dst_rsv->space_info = src_rsv->space_info;
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ u64 num_bytes;
+ int ret;
+
+ if (root->fs_info->quota_enabled) {
+ /* One for parent inode, two for dir entries */
+ num_bytes = 3 * root->leafsize;
+ ret = btrfs_qgroup_reserve(root, num_bytes);
+ if (ret)
+ return ret;
+ } else {
+ num_bytes = 0;
+ }
+
+ *qgroup_reserved = num_bytes;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, items);
+ rsv->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+ ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ if (*qgroup_reserved)
+ btrfs_qgroup_free(root, *qgroup_reserved);
+ }
+
+ return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv,
+ u64 qgroup_reserved)
+{
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ if (qgroup_reserved)
+ btrfs_qgroup_free(root, qgroup_reserved);
}
/**
@@ -4532,17 +4704,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
u64 csum_bytes;
unsigned nr_extents = 0;
int extra_reserve = 0;
- int flush = 1;
- int ret;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+ int ret = 0;
+ bool delalloc_lock = true;
+ u64 to_free = 0;
+ unsigned dropped;
- /* Need to be holding the i_mutex here if we aren't free space cache */
- if (btrfs_is_free_space_inode(inode))
- flush = 0;
+ /* If we are a free space inode we need to not flush since we will be in
+ * the middle of a transaction commit. We also don't need the delalloc
+ * mutex since we won't race with anybody. We need this mostly to make
+ * lockdep shut its filthy mouth.
+ */
+ if (btrfs_is_free_space_inode(inode)) {
+ flush = BTRFS_RESERVE_NO_FLUSH;
+ delalloc_lock = false;
+ }
- if (flush && btrfs_transaction_in_commit(root->fs_info))
+ if (flush != BTRFS_RESERVE_NO_FLUSH &&
+ btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
- mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+ if (delalloc_lock)
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
@@ -4571,44 +4755,16 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (root->fs_info->quota_enabled) {
ret = btrfs_qgroup_reserve(root, num_bytes +
nr_extents * root->leafsize);
- if (ret) {
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
- return ret;
- }
+ if (ret)
+ goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
- if (ret) {
- u64 to_free = 0;
- unsigned dropped;
-
- spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
- /*
- * If the inodes csum_bytes is the same as the original
- * csum_bytes then we know we haven't raced with any free()ers
- * so we can just reduce our inodes csum bytes and carry on.
- * Otherwise we have to do the normal free thing to account for
- * the case that the free side didn't free up its reserve
- * because of this outstanding reservation.
- */
- if (BTRFS_I(inode)->csum_bytes == csum_bytes)
- calc_csum_metadata_size(inode, num_bytes, 0);
- else
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
- spin_unlock(&BTRFS_I(inode)->lock);
- if (dropped)
- to_free += btrfs_calc_trans_metadata_size(root, dropped);
-
- if (to_free) {
- btrfs_block_rsv_release(root, block_rsv, to_free);
- trace_btrfs_space_reservation(root->fs_info,
- "delalloc",
- btrfs_ino(inode),
- to_free, 0);
- }
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
- return ret;
+ if (unlikely(ret)) {
+ if (root->fs_info->quota_enabled)
+ btrfs_qgroup_free(root, num_bytes +
+ nr_extents * root->leafsize);
+ goto out_fail;
}
spin_lock(&BTRFS_I(inode)->lock);
@@ -4619,7 +4775,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
- mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
if (to_reserve)
trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4627,6 +4785,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
+
+out_fail:
+ spin_lock(&BTRFS_I(inode)->lock);
+ dropped = drop_outstanding_extent(inode);
+ /*
+ * If the inodes csum_bytes is the same as the original
+ * csum_bytes then we know we haven't raced with any free()ers
+ * so we can just reduce our inodes csum bytes and carry on.
+ * Otherwise we have to do the normal free thing to account for
+ * the case that the free side didn't free up its reserve
+ * because of this outstanding reservation.
+ */
+ if (BTRFS_I(inode)->csum_bytes == csum_bytes)
+ calc_csum_metadata_size(inode, num_bytes, 0);
+ else
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ spin_unlock(&BTRFS_I(inode)->lock);
+ if (dropped)
+ to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+ if (to_free) {
+ btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
+ }
+ if (delalloc_lock)
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ return ret;
}
/**
@@ -4648,7 +4834,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
spin_lock(&BTRFS_I(inode)->lock);
dropped = drop_outstanding_extent(inode);
- to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+ if (num_bytes)
+ to_free = calc_csum_metadata_size(inode, num_bytes, 0);
spin_unlock(&BTRFS_I(inode)->lock);
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4715,8 +4902,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
btrfs_free_reserved_data_space(inode, num_bytes);
}
-static int update_block_group(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -4753,7 +4939,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
* space back to the block group, otherwise we will leak space.
*/
if (!alloc && cache->cached == BTRFS_CACHE_NO)
- cache_block_group(cache, trans, NULL, 1);
+ cache_block_group(cache, 1);
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -4803,6 +4989,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
struct btrfs_block_group_cache *cache;
u64 bytenr;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ bytenr = root->fs_info->first_logical_byte;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+
+ if (bytenr < (u64)-1)
+ return bytenr;
+
cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
if (!cache)
return 0;
@@ -4853,8 +5046,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
/*
* this function must be called within transaction
*/
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
u64 bytenr, u64 num_bytes)
{
struct btrfs_block_group_cache *cache;
@@ -4868,7 +5060,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
* to one because the slow code to read in the free extents does check
* the pinned extents.
*/
- cache_block_group(cache, trans, root, 1);
+ cache_block_group(cache, 1);
pin_down_extent(root, cache, bytenr, num_bytes, 0);
@@ -4969,9 +5161,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 len;
+ bool readonly;
while (start <= end) {
+ readonly = false;
if (!cache ||
start >= cache->key.objectid + cache->key.offset) {
if (cache)
@@ -4989,15 +5185,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
}
start += len;
+ space_info = cache->space_info;
- spin_lock(&cache->space_info->lock);
+ spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
- if (cache->ro)
- cache->space_info->bytes_readonly += len;
+ space_info->bytes_pinned -= len;
+ if (cache->ro) {
+ space_info->bytes_readonly += len;
+ readonly = true;
+ }
spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
+ if (!readonly && global_rsv->space_info == space_info) {
+ spin_lock(&global_rsv->lock);
+ if (!global_rsv->full) {
+ len = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += len;
+ space_info->bytes_may_use += len;
+ if (global_rsv->reserved >= global_rsv->size)
+ global_rsv->full = 1;
+ }
+ spin_unlock(&global_rsv->lock);
+ }
+ spin_unlock(&space_info->lock);
}
if (cache)
@@ -5246,7 +5457,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0);
+ ret = update_block_group(root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5291,7 +5502,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->extent_op) {
if (!head->must_insert_reserved)
goto out;
- kfree(head->extent_op);
+ btrfs_free_delayed_extent_op(head->extent_op);
head->extent_op = NULL;
}
@@ -5414,10 +5625,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 val, u64 num_bytes)
{
- u64 mask = ((u64)root->stripesize - 1);
- u64 ret = (val + mask) & ~mask;
+ u64 ret = ALIGN(val, root->stripesize);
return ret;
}
@@ -5437,7 +5649,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
@@ -5454,7 +5665,6 @@ static noinline int
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
{
struct btrfs_caching_control *caching_ctl;
- DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
@@ -5466,27 +5676,27 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
-static int __get_block_group_index(u64 flags)
+int __get_raid_index(u64 flags)
{
- int index;
-
if (flags & BTRFS_BLOCK_GROUP_RAID10)
- index = 0;
+ return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- index = 1;
+ return BTRFS_RAID_RAID1;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
- index = 2;
+ return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- index = 3;
- else
- index = 4;
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
- return index;
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
static int get_block_group_index(struct btrfs_block_group_cache *cache)
{
- return __get_block_group_index(cache->flags);
+ return __get_raid_index(cache->flags);
}
enum btrfs_loop_type {
@@ -5519,7 +5729,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int empty_cluster = 2 * 1024 * 1024;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = 0;
+ int index = __get_raid_index(data);
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
@@ -5624,6 +5834,8 @@ search:
if (!block_group_bits(block_group, data)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10;
/*
@@ -5639,8 +5851,7 @@ have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
found_uncached_bg = true;
- ret = cache_block_group(block_group, trans,
- orig_root, 0);
+ ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
}
@@ -5653,6 +5864,7 @@ have_block_group:
* lets look there
*/
if (last_ptr) {
+ unsigned long aligned_cluster;
/*
* the refill lock keeps out other
* people trying to start a new cluster
@@ -5719,11 +5931,15 @@ refill_cluster:
goto unclustered_alloc;
}
+ aligned_cluster = max_t(unsigned long,
+ empty_cluster + empty_size,
+ block_group->full_stripe_len);
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
search_start, num_bytes,
- empty_cluster + empty_size);
+ aligned_cluster);
if (ret == 0) {
/*
* now pull our allocation out of this
@@ -5794,7 +6010,8 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, offset);
+ search_start = stripe_align(root, used_block_group,
+ offset, num_bytes);
/* move on to the next group */
if (search_start + num_bytes >
@@ -5945,7 +6162,7 @@ again:
if (ret == -ENOSPC) {
if (!final_tried) {
num_bytes = num_bytes >> 1;
- num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = round_down(num_bytes, root->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
if (num_bytes == min_alloc_size)
final_tried = true;
@@ -6069,7 +6286,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -6133,7 +6350,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -6176,7 +6393,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- cache_block_group(block_group, trans, NULL, 0);
+ cache_block_group(block_group, 0);
caching_ctl = get_caching_control(block_group);
if (!caching_ctl) {
@@ -6269,7 +6486,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
if (block_rsv->size == 0) {
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
/*
* If we couldn't reserve metadata bytes try and use some from
* the global reserve.
@@ -6289,14 +6507,16 @@ use_block_rsv(struct btrfs_trans_handle *trans,
if (!ret)
return block_rsv;
if (ret && !block_rsv->failfast) {
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- /*DEFAULT_RATELIMIT_BURST*/ 2);
- if (__ratelimit(&_rs)) {
- printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
- WARN_ON(1);
- }
- ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL * 10,
+ /*DEFAULT_RATELIMIT_BURST*/ 1);
+ if (__ratelimit(&_rs))
+ WARN(1, KERN_DEBUG
+ "btrfs: block rsv returned %d\n", ret);
+ }
+ ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+ BTRFS_RESERVE_NO_FLUSH);
if (!ret) {
return block_rsv;
} else if (ret && block_rsv != global_rsv) {
@@ -6360,7 +6580,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ extent_op = btrfs_alloc_delayed_extent_op();
BUG_ON(!extent_op); /* -ENOMEM */
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -6482,7 +6702,7 @@ reada:
}
/*
- * hepler to process tree block while walking down the tree.
+ * helper to process tree block while walking down the tree.
*
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
@@ -6557,7 +6777,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
}
/*
- * hepler to process tree block pointer.
+ * helper to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
* reference count of the block pointed to. if the block
@@ -6695,7 +6915,7 @@ skip:
}
/*
- * hepler to process tree block while walking up the tree.
+ * helper to process tree block while walking up the tree.
*
* when wc->stage == DROP_REFERENCE, this function drops
* reference count on the block.
@@ -6746,11 +6966,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
&wc->flags[level]);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return ret;
}
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return 1;
}
}
@@ -7161,6 +7383,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
root->fs_info->fs_devices->missing_devices;
stripped = BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
if (num_devices == 1) {
@@ -7427,7 +7650,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
*/
target = get_restripe_target(root->fs_info, block_group->flags);
if (target) {
- index = __get_block_group_index(extended_to_chunk(target));
+ index = __get_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -7439,16 +7662,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
index = get_block_group_index(block_group);
}
- if (index == 0) {
+ if (index == BTRFS_RAID_RAID10) {
dev_min = 4;
/* Divide by 2 */
min_free >>= 1;
- } else if (index == 1) {
+ } else if (index == BTRFS_RAID_RAID1) {
dev_min = 2;
- } else if (index == 2) {
+ } else if (index == BTRFS_RAID_DUP) {
/* Multiply by 2 */
min_free <<= 1;
- } else if (index == 3) {
+ } else if (index == BTRFS_RAID_RAID0) {
dev_min = fs_devices->rw_devices;
do_div(min_free, dev_min);
}
@@ -7461,7 +7684,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* check to make sure we can actually find a chunk with enough
* space to fit our block group in.
*/
- if (device->total_bytes > device->bytes_used + min_free) {
+ if (device->total_bytes > device->bytes_used + min_free &&
+ !device->is_tgtdev_for_dev_replace) {
ret = find_free_dev_extent(device, min_free,
&dev_offset, NULL);
if (!ret)
@@ -7608,11 +7832,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
- if (space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0) {
- WARN_ON(1);
- dump_space_info(space_info, 0, 0);
+ if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+ if (space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0 ||
+ space_info->bytes_may_use > 0) {
+ WARN_ON(1);
+ dump_space_info(space_info, 0, 0);
+ }
}
list_del(&space_info->list);
kfree(space_info);
@@ -7711,7 +7937,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
btrfs_release_path(path);
cache->flags = btrfs_block_group_flags(&cache->item);
cache->sectorsize = root->sectorsize;
-
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ found_key.objectid);
btrfs_init_free_space_ctl(cache);
/*
@@ -7765,6 +7993,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
if (!(get_alloc_profile(root, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_DUP)))
continue;
/*
@@ -7840,6 +8070,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
cache->sectorsize = root->sectorsize;
cache->fs_info = root->fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ chunk_offset);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
@@ -7889,12 +8122,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = chunk_to_extended(flags) &
BTRFS_EXTENDED_PROFILE_MASK;
+ write_seqlock(&fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
fs_info->avail_data_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_METADATA)
fs_info->avail_metadata_alloc_bits &= ~extra_flags;
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
fs_info->avail_system_alloc_bits &= ~extra_flags;
+ write_sequnlock(&fs_info->profiles_lock);
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -7993,6 +8228,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&root->fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
+
+ if (root->fs_info->first_logical_byte == block_group->key.objectid)
+ root->fs_info->first_logical_byte = (u64)-1;
spin_unlock(&root->fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
@@ -8115,7 +8353,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
if (end - start >= range->minlen) {
if (!block_group_cache_done(cache)) {
- ret = cache_block_group(cache, NULL, root, 0);
+ ret = cache_block_group(cache, 0);
if (!ret)
wait_block_group_cache_done(cache);
}