From 401b41e5a85a635fd9888ba8969c5006a5dbd399 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sat, 6 May 2017 07:17:54 +0800
Subject: btrfs: add framework to handle device flush error as a volume

This adds comments to the flush error handling part of the code and,
while aiming to keep the existing logic unchanged, introduces a framework
that can be used to handle the errors at the volume level.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5f678dcb20e6..bafdd2fe8f88 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3509,6 +3509,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	if (wait) {
 		bio = device->flush_bio;
 		if (!bio)
+			/*
+			 * This means the alloc has failed with ENOMEM, however
+			 * here we return 0, as it's not a device error.
+			 */
 			return 0;
 
 		wait_for_completion(&device->flush_wait);
@@ -3548,6 +3552,32 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	return 0;
 }
 
+static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
+{
+	int submit_flush_error = 0;	/* no bdev, or -ENOMEM */
+	int dev_flush_error = 0;	/* no bdev, or a non-ENOMEM error */
+	struct btrfs_device *dev;
+	int tolerance;
+
+	list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) {
+		if (!dev->bdev) {	/* missing device: count in both */
+			submit_flush_error++;
+			dev_flush_error++;
+			continue;
+		}
+		if (dev->last_flush_error == -ENOMEM)
+			submit_flush_error++;
+		if (dev->last_flush_error && dev->last_flush_error != -ENOMEM)
+			dev_flush_error++;
+	}
+
+	tolerance = fsdevs->fs_info->num_tolerated_disk_barrier_failures;
+	if (submit_flush_error > tolerance || dev_flush_error > tolerance)
+		return -EIO;	/* either tally exceeds the tolerance */
+
+	return 0;
+}
+
 /*
  * send an empty flush down to each device in parallel,
  * then wait for them
@@ -3575,6 +3605,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		ret = write_dev_flush(dev, 0);
 		if (ret)
 			errors_send++;
+		dev->last_flush_error = ret;
 	}
 
 	/* wait for all the barriers */
@@ -3589,12 +3620,30 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			continue;
 
 		ret = write_dev_flush(dev, 1);
-		if (ret)
+		if (ret) {
+			dev->last_flush_error = ret;
 			errors_wait++;
+		}
+	}
+
+	/*
+	 * Try hard in case of flush. Let's say, in RAID1 we have
+	 * the following situation
+	 *  dev1: EIO dev2: ENOMEM
+	 * this is not a fatal error as we hope to recover from
+	 * ENOMEM in the next attempt to flush.
+	 * But the following is considered as fatal
+	 *  dev1: ENOMEM dev2: ENOMEM
+	 *  dev1: bdev == NULL dev2: ENOMEM
+	 */
+	if (errors_send || errors_wait) {
+		/*
+		 * At some point we need the status of all disks
+		 * to arrive at the volume status. So error checking
+		 * is being pushed to a separate loop.
+		 */
+		return check_barrier_error(info->fs_devices);
 	}
-	if (errors_send > info->num_tolerated_disk_barrier_failures ||
-	    errors_wait > info->num_tolerated_disk_barrier_failures)
-		return -EIO;
 	return 0;
 }
 
-- 
cgit v1.2.3


From a5ed45f8224f2c7e4ad5a9673cb50e8e3128bd88 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 11 May 2017 09:17:46 +0300
Subject: btrfs: Convert fs_info->free_chunk_space to atomic64_t

The ->free_chunk_space variable is used to track the unallocated space
and access to it is protected by a spinlock, which is not used for
anything else.  Make the code a bit more self-explanatory by switching
the variable to an atomic64_t type and kill the spinlock.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
[ not a performance critical code, use of atomic type is ok ]
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       |  3 +--
 fs/btrfs/disk-io.c     |  3 +--
 fs/btrfs/extent-tree.c |  4 +---
 fs/btrfs/volumes.c     | 26 +++++++-------------------
 4 files changed, 10 insertions(+), 26 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4f8f75d9e839..0334452a7be1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -748,8 +748,7 @@ struct btrfs_fs_info {
 	struct rb_root block_group_cache_tree;
 
 	/* keep track of unallocated space */
-	spinlock_t free_chunk_lock;
-	u64 free_chunk_space;
+	atomic64_t free_chunk_space;
 
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bafdd2fe8f88..9b8df5aaf76e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
-	spin_lock_init(&fs_info->free_chunk_lock);
 	spin_lock_init(&fs_info->tree_mod_seq_lock);
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
@@ -2667,7 +2666,7 @@ int open_ctree(struct super_block *sb,
 	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
-	fs_info->free_chunk_space = 0;
+	atomic64_set(&fs_info->free_chunk_space, 0);
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
 	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 33d979e9ea2a..4c0d3980fe3f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4646,9 +4646,7 @@ static int can_overcommit(struct btrfs_root *root,
 
 	used += space_info->bytes_may_use;
 
-	spin_lock(&fs_info->free_chunk_lock);
-	avail = fs_info->free_chunk_space;
-	spin_unlock(&fs_info->free_chunk_lock);
+	avail = atomic64_read(&fs_info->free_chunk_space);
 
 	/*
 	 * If we have dup, raid1 or raid10 then only half of the free
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 017b67daa3bb..e37f95976443 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2417,9 +2417,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	fs_info->fs_devices->total_devices++;
 	fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
-	spin_lock(&fs_info->free_chunk_lock);
-	fs_info->free_chunk_space += device->total_bytes;
-	spin_unlock(&fs_info->free_chunk_lock);
+	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
 
 	if (!blk_queue_nonrot(q))
 		fs_info->fs_devices->rotating = 1;
@@ -2874,9 +2872,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 			mutex_lock(&fs_info->chunk_mutex);
 			btrfs_device_set_bytes_used(device,
 					device->bytes_used - dev_extent_len);
-			spin_lock(&fs_info->free_chunk_lock);
-			fs_info->free_chunk_space += dev_extent_len;
-			spin_unlock(&fs_info->free_chunk_lock);
+			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
 			btrfs_clear_space_info_full(fs_info);
 			mutex_unlock(&fs_info->chunk_mutex);
 		}
@@ -4409,9 +4405,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	btrfs_device_set_total_bytes(device, new_size);
 	if (device->writeable) {
 		device->fs_devices->total_rw_bytes -= diff;
-		spin_lock(&fs_info->free_chunk_lock);
-		fs_info->free_chunk_space -= diff;
-		spin_unlock(&fs_info->free_chunk_lock);
+		atomic64_sub(diff, &fs_info->free_chunk_space);
 	}
 	mutex_unlock(&fs_info->chunk_mutex);
 
@@ -4535,9 +4529,7 @@ done:
 		btrfs_device_set_total_bytes(device, old_size);
 		if (device->writeable)
 			device->fs_devices->total_rw_bytes += diff;
-		spin_lock(&fs_info->free_chunk_lock);
-		fs_info->free_chunk_space += diff;
-		spin_unlock(&fs_info->free_chunk_lock);
+		atomic64_add(diff, &fs_info->free_chunk_space);
 		mutex_unlock(&fs_info->chunk_mutex);
 	}
 	return ret;
@@ -4882,9 +4874,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
 	}
 
-	spin_lock(&info->free_chunk_lock);
-	info->free_chunk_space -= (stripe_size * map->num_stripes);
-	spin_unlock(&info->free_chunk_lock);
+	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
 
 	free_extent_map(em);
 	check_raid56_incompat_flag(info, type);
@@ -6684,10 +6674,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
 	device->in_fs_metadata = 1;
 	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
-		spin_lock(&fs_info->free_chunk_lock);
-		fs_info->free_chunk_space += device->total_bytes -
-			device->bytes_used;
-		spin_unlock(&fs_info->free_chunk_lock);
+		atomic64_add(device->total_bytes - device->bytes_used,
+				&fs_info->free_chunk_space);
 	}
 	ret = 0;
 	return ret;
-- 
cgit v1.2.3


From c6100a4b4e3d1650deafda45e49571b83270c714 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 5 May 2017 11:57:13 -0400
Subject: Btrfs: replace tree->mapping with tree->private_data

For extent_io trees we have carried the address_space mapping of the inode
around in the io tree in order to pull the inode back out for calling
into various tree ops hooks.  This works fine when everything that has
an extent_io_tree has an inode.  But we are going to remove the
btree_inode, so we need to change this.  Instead just have a generic
void * for private data that we can initialize with, and have all the
tree ops use that instead.  This had a lot of cascading changes but
should be relatively straightforward.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor reordering of the callback prototypes ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h                 |  1 +
 fs/btrfs/disk-io.c               | 50 ++++++++++++++----------
 fs/btrfs/disk-io.h               |  6 +--
 fs/btrfs/extent_io.c             | 52 ++++++++-----------------
 fs/btrfs/extent_io.h             | 21 +++++-----
 fs/btrfs/inode.c                 | 82 +++++++++++++++++++++++++++++++---------
 fs/btrfs/relocation.c            |  3 +-
 fs/btrfs/tests/extent-io-tests.c |  2 +-
 fs/btrfs/transaction.c           |  2 +-
 9 files changed, 128 insertions(+), 91 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aea1b3cee887..c457cb177340 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3172,6 +3172,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags);
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
 int btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9b8df5aaf76e..d8ad30fdeee6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -118,7 +118,8 @@ void btrfs_end_io_wq_exit(void)
  * just before they are sent down the IO stack.
  */
 struct async_submit_bio {
-	struct inode *inode;
+	void *private_data;
+	struct btrfs_fs_info *fs_info;
 	struct bio *bio;
 	struct list_head list;
 	extent_submit_bio_hook_t *submit_bio_start;
@@ -871,7 +872,7 @@ static void run_one_async_start(struct btrfs_work *work)
 	int ret;
 
 	async = container_of(work, struct  async_submit_bio, work);
-	ret = async->submit_bio_start(async->inode, async->bio,
+	ret = async->submit_bio_start(async->private_data, async->bio,
 				      async->mirror_num, async->bio_flags,
 				      async->bio_offset);
 	if (ret)
@@ -885,7 +886,7 @@ static void run_one_async_done(struct btrfs_work *work)
 	int limit;
 
 	async = container_of(work, struct  async_submit_bio, work);
-	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	fs_info = async->fs_info;
 
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
@@ -904,7 +905,7 @@ static void run_one_async_done(struct btrfs_work *work)
 		return;
 	}
 
-	async->submit_bio_done(async->inode, async->bio, async->mirror_num,
+	async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
 			       async->bio_flags, async->bio_offset);
 }
 
@@ -916,10 +917,9 @@ static void run_one_async_free(struct btrfs_work *work)
 	kfree(async);
 }
 
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
-			struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
-			u64 bio_offset,
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+			int mirror_num, unsigned long bio_flags,
+			u64 bio_offset, void *private_data,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -929,7 +929,8 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	if (!async)
 		return -ENOMEM;
 
-	async->inode = inode;
+	async->private_data = private_data;
+	async->fs_info = fs_info;
 	async->bio = bio;
 	async->mirror_num = mirror_num;
 	async->submit_bio_start = submit_bio_start;
@@ -975,7 +976,7 @@ static int btree_csum_one_bio(struct bio *bio)
 	return ret;
 }
 
-static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
+static int __btree_submit_bio_start(void *private_data, struct bio *bio,
 				    int mirror_num, unsigned long bio_flags,
 				    u64 bio_offset)
 {
@@ -986,10 +987,11 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
 	return btree_csum_one_bio(bio);
 }
 
-static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
+static int __btree_submit_bio_done(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	int ret;
 
 	/*
@@ -1015,10 +1017,11 @@ static int check_async_write(unsigned long bio_flags)
 	return 1;
 }
 
-static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
+static int btree_submit_bio_hook(void *private_data, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int async = check_async_write(bio_flags);
 	int ret;
@@ -1043,8 +1046,8 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
 		 * kthread helpers are used to submit writes so that
 		 * checksumming can happen in parallel across all CPUs
 		 */
-		ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0,
-					  bio_offset,
+		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
+					  bio_offset, private_data,
 					  __btree_submit_bio_start,
 					  __btree_submit_bio_done);
 	}
@@ -1347,8 +1350,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
 	if (!dummy)
-		extent_io_tree_init(&root->dirty_log_pages,
-				     fs_info->btree_inode->i_mapping);
+		extent_io_tree_init(&root->dirty_log_pages, NULL);
 
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
@@ -2309,7 +2311,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
 	inode->i_mapping->a_ops = &btree_aops;
 
 	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping);
+	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
 	BTRFS_I(inode)->io_tree.track_uptodate = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
 
@@ -2703,10 +2705,8 @@ int open_ctree(struct super_block *sb,
 	fs_info->block_group_cache_tree = RB_ROOT;
 	fs_info->first_logical_byte = (u64)-1;
 
-	extent_io_tree_init(&fs_info->freed_extents[0],
-			     fs_info->btree_inode->i_mapping);
-	extent_io_tree_init(&fs_info->freed_extents[1],
-			     fs_info->btree_inode->i_mapping);
+	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
 	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
 
@@ -4686,6 +4686,12 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
+static struct btrfs_fs_info *btree_fs_info(void *private_data)
+{
+	struct inode *inode = private_data;
+	return btrfs_sb(inode->i_sb);
+}
+
 static const struct extent_io_ops btree_extent_io_ops = {
 	/* mandatory callbacks */
 	.submit_bio_hook = btree_submit_bio_hook,
@@ -4693,6 +4699,8 @@ static const struct extent_io_ops btree_extent_io_ops = {
 	/* note we're sharing with inode.c for the merge bio hook */
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
+	.set_range_writeback = btrfs_set_range_writeback,
+	.tree_fs_info = btree_fs_info,
 
 	/* optional callbacks */
 };
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 21f1ceb85b76..35ddfcf04ad0 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -120,9 +120,9 @@ u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, u8 *result);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			enum btrfs_wq_endio_type metadata);
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
-			struct bio *bio, int mirror_num,
-			unsigned long bio_flags, u64 bio_offset,
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+			int mirror_num, unsigned long bio_flags,
+			u64 bio_offset, void *private_data,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d3619e010005..70832b9b9e21 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void)
 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 		struct extent_io_tree *tree, u64 start, u64 end)
 {
-	struct inode *inode;
-	u64 isize;
-
-	if (!tree->mapping)
-		return;
-
-	inode = tree->mapping->host;
-	isize = i_size_read(inode);
-	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
-		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
-			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
-	}
+	if (tree->ops && tree->ops->check_extent_io_range)
+		tree->ops->check_extent_io_range(tree->private_data, caller,
+						 start, end);
 }
 #else
 #define btrfs_leak_debug_add(new, head)	do {} while (0)
@@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
 {
-	if (!tree->mapping)
-		return NULL;
-	return btrfs_sb(tree->mapping->host->i_sb);
+	if (tree->ops)
+		return tree->ops->tree_fs_info(tree->private_data);
+	return NULL;
 }
 
 int __init extent_io_init(void)
@@ -213,13 +203,13 @@ void extent_io_exit(void)
 }
 
 void extent_io_tree_init(struct extent_io_tree *tree,
-			 struct address_space *mapping)
+			 void *private_data)
 {
 	tree->state = RB_ROOT;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
-	tree->mapping = mapping;
+	tree->private_data = private_data;
 }
 
 static struct extent_state *alloc_extent_state(gfp_t mask)
@@ -369,8 +359,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 		     struct extent_state *other)
 {
 	if (tree->ops && tree->ops->merge_extent_hook)
-		tree->ops->merge_extent_hook(tree->mapping->host, new,
-					     other);
+		tree->ops->merge_extent_hook(tree->private_data, new, other);
 }
 
 /*
@@ -421,15 +410,14 @@ static void set_state_cb(struct extent_io_tree *tree,
 			 struct extent_state *state, unsigned *bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook)
-		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+		tree->ops->set_bit_hook(tree->private_data, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
 			   struct extent_state *state, unsigned *bits)
 {
 	if (tree->ops && tree->ops->clear_bit_hook)
-		tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host),
-				state, bits);
+		tree->ops->clear_bit_hook(tree->private_data, state, bits);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
@@ -478,7 +466,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 		     u64 split)
 {
 	if (tree->ops && tree->ops->split_extent_hook)
-		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+		tree->ops->split_extent_hook(tree->private_data, orig, split);
 }
 
 /*
@@ -1402,17 +1390,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
  */
 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
-	unsigned long index = start >> PAGE_SHIFT;
-	unsigned long end_index = end >> PAGE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page); /* Pages should be in the extent_io_tree */
-		set_page_writeback(page);
-		put_page(page);
-		index++;
-	}
+	tree->ops->set_range_writeback(tree->private_data, start, end);
 }
 
 /* find the first state struct with 'bits' set after 'start', and
@@ -2431,7 +2409,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		"Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
 		read_mode, failrec->this_mirror, failrec->in_validation);
 
-	ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
+	ret = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
 					 failrec->bio_flags, 0);
 	if (ret) {
 		free_io_failure(BTRFS_I(inode), failrec);
@@ -2755,7 +2733,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 	bio_get(bio);
 
 	if (tree->ops)
-		ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
+		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
 					   mirror_num, bio_flags, start);
 	else
 		btrfsic_submit_bio(bio);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1eafa2f0ede3..15ef696dda51 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -92,7 +92,7 @@ struct btrfs_inode;
 struct btrfs_io_bio;
 struct io_failure_record;
 
-typedef	int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+typedef	int (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
 				       int mirror_num, unsigned long bio_flags,
 				       u64 bio_offset);
 struct extent_io_ops {
@@ -108,32 +108,36 @@ struct extent_io_ops {
 			      size_t size, struct bio *bio,
 			      unsigned long bio_flags);
 	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+	struct btrfs_fs_info *(*tree_fs_info)(void *private_data);
+	void (*set_range_writeback)(void *private_data, u64 start, u64 end);
 
 	/*
 	 * Optional hooks, called if the pointer is not NULL
 	 */
-	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+	int (*fill_delalloc)(void *private_data, struct page *locked_page,
 			     u64 start, u64 end, int *page_started,
 			     unsigned long *nr_written);
 
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
-	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+	void (*set_bit_hook)(void *private_data, struct extent_state *state,
 			     unsigned *bits);
-	void (*clear_bit_hook)(struct btrfs_inode *inode,
+	void (*clear_bit_hook)(void *private_data,
 			struct extent_state *state,
 			unsigned *bits);
-	void (*merge_extent_hook)(struct inode *inode,
+	void (*merge_extent_hook)(void *private_data,
 				  struct extent_state *new,
 				  struct extent_state *other);
-	void (*split_extent_hook)(struct inode *inode,
+	void (*split_extent_hook)(void *private_data,
 				  struct extent_state *orig, u64 split);
+	void (*check_extent_io_range)(void *private_data, const char *caller,
+				      u64 start, u64 end);
 };
 
 struct extent_io_tree {
 	struct rb_root state;
-	struct address_space *mapping;
+	void *private_data;
 	u64 dirty_bytes;
 	int track_uptodate;
 	spinlock_t lock;
@@ -230,8 +234,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
 					  u64 start, u64 len,
 					  int create);
 
-void extent_io_tree_init(struct extent_io_tree *tree,
-			 struct address_space *mapping);
+void extent_io_tree_init(struct extent_io_tree *tree, void *private_data);
 int try_release_extent_mapping(struct extent_map_tree *map,
 			       struct extent_io_tree *tree, struct page *page,
 			       gfp_t mask);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ada36206737f..54aa757d4cc3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1569,10 +1569,11 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
 /*
  * extent_io.c call back to do delayed allocation processing
  */
-static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+static int run_delalloc_range(void *private_data, struct page *locked_page,
 			      u64 start, u64 end, int *page_started,
 			      unsigned long *nr_written)
 {
+	struct inode *inode = private_data;
 	int ret;
 	int force_cow = need_force_cow(inode, start, end);
 
@@ -1596,9 +1597,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	return ret;
 }
 
-static void btrfs_split_extent_hook(struct inode *inode,
+static void btrfs_split_extent_hook(void *private_data,
 				    struct extent_state *orig, u64 split)
 {
+	struct inode *inode = private_data;
 	u64 size;
 
 	/* not delalloc, ignore it */
@@ -1633,10 +1635,11 @@ static void btrfs_split_extent_hook(struct inode *inode,
  * extents, such as when we are doing sequential writes, so we can properly
  * account for the metadata space we'll need.
  */
-static void btrfs_merge_extent_hook(struct inode *inode,
+static void btrfs_merge_extent_hook(void *private_data,
 				    struct extent_state *new,
 				    struct extent_state *other)
 {
+	struct inode *inode = private_data;
 	u64 new_size, old_size;
 	u32 num_extents;
 
@@ -1736,9 +1739,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-static void btrfs_set_bit_hook(struct inode *inode,
+static void btrfs_set_bit_hook(void *private_data,
 			       struct extent_state *state, unsigned *bits)
 {
+	struct inode *inode = private_data;
 
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 
@@ -1790,10 +1794,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
 /*
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
-static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
+static void btrfs_clear_bit_hook(void *private_data,
 				 struct extent_state *state,
 				 unsigned *bits)
 {
+	struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 	u64 len = state->end + 1 - state->start;
 	u32 num_extents = count_max_extents(len);
@@ -1901,10 +1906,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
+static int __btrfs_submit_bio_start(void *private_data, struct bio *bio,
 				    int mirror_num, unsigned long bio_flags,
 				    u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	int ret = 0;
 
 	ret = btrfs_csum_one_bio(inode, bio, 0, 0);
@@ -1920,10 +1926,11 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
+static int __btrfs_submit_bio_done(void *private_data, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags,
 			  u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret;
 
@@ -1939,10 +1946,11 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
  * extent_io.c submission hook. This does the right thing for csum calculation
  * on write, or reading the csums from the tree before a read
  */
-static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags,
-			  u64 bio_offset)
+static int btrfs_submit_bio_hook(void *private_data, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
+	struct inode *inode = private_data;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
@@ -1976,8 +1984,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
-		ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
-					  bio_flags, bio_offset,
+		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
+					  bio_offset, inode,
 					  __btrfs_submit_bio_start,
 					  __btrfs_submit_bio_done);
 		goto out;
@@ -8306,10 +8314,11 @@ static void btrfs_endio_direct_write(struct bio *bio)
 	bio_put(bio);
 }
 
-static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
+static int __btrfs_submit_bio_start_direct_io(void *private_data,
 				    struct bio *bio, int mirror_num,
 				    unsigned long bio_flags, u64 offset)
 {
+	struct inode *inode = private_data;
 	int ret;
 	ret = btrfs_csum_one_bio(inode, bio, offset, 1);
 	BUG_ON(ret); /* -ENOMEM */
@@ -8421,8 +8430,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 		goto map;
 
 	if (write && async_submit) {
-		ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0,
-					  file_offset,
+		ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
+					  file_offset, inode,
 					  __btrfs_submit_bio_start_direct_io,
 					  __btrfs_submit_bio_done);
 		goto err;
@@ -9402,8 +9411,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 
 	inode = &ei->vfs_inode;
 	extent_map_tree_init(&ei->extent_tree);
-	extent_io_tree_init(&ei->io_tree, &inode->i_data);
-	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+	extent_io_tree_init(&ei->io_tree, inode);
+	extent_io_tree_init(&ei->io_failure_tree, inode);
 	ei->io_tree.track_uptodate = 1;
 	ei->io_failure_tree.track_uptodate = 1;
 	atomic_set(&ei->sync_writers, 0);
@@ -10657,6 +10666,42 @@ static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
 	return -EAGAIN;
 }
 
+static struct btrfs_fs_info *iotree_fs_info(void *private_data)
+{
+	struct inode *inode = private_data;
+	return btrfs_sb(inode->i_sb);
+}
+
+static void btrfs_check_extent_io_range(void *private_data, const char *caller,
+					u64 start, u64 end)
+{
+	struct inode *inode = private_data;
+	u64 isize;
+
+	isize = i_size_read(inode);
+	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
+			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+	}
+}
+
+void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
+{
+	struct inode *inode = private_data;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		ASSERT(page); /* Pages should be in the extent_io_tree */
+		set_page_writeback(page);
+		put_page(page);
+		index++;
+	}
+}
+
 static const struct inode_operations btrfs_dir_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.lookup		= btrfs_lookup,
@@ -10700,6 +10745,8 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
+	.tree_fs_info = iotree_fs_info,
+	.set_range_writeback = btrfs_set_range_writeback,
 
 	/* optional callbacks */
 	.fill_delalloc = run_delalloc_range,
@@ -10709,6 +10756,7 @@ static const struct extent_io_ops btrfs_extent_io_ops = {
 	.clear_bit_hook = btrfs_clear_bit_hook,
 	.merge_extent_hook = btrfs_merge_extent_hook,
 	.split_extent_hook = btrfs_split_extent_hook,
+	.check_extent_io_range = btrfs_check_extent_io_range,
 };
 
 /*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d60df51959f7..b291d1bebb4c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4269,8 +4269,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	backref_cache_init(&rc->backref_cache);
 	mapping_tree_init(&rc->reloc_root_tree);
-	extent_io_tree_init(&rc->processed_blocks,
-			    fs_info->btree_inode->i_mapping);
+	extent_io_tree_init(&rc->processed_blocks, NULL);
 	return rc;
 }
 
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 133753232a94..d06b1c931d05 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -87,7 +87,7 @@ static int test_find_delalloc(u32 sectorsize)
 		return -ENOMEM;
 	}
 
-	extent_io_tree_init(&tmp, &inode->i_data);
+	extent_io_tree_init(&tmp, inode);
 
 	/*
 	 * First go through and create and mark all of our pages dirty, we pin
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2168654c90a1..ca0009ff47f1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -294,7 +294,7 @@ loop:
 	spin_lock_init(&cur_trans->dropped_roots_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
-			     fs_info->btree_inode->i_mapping);
+			     fs_info->btree_inode);
 	fs_info->generation++;
 	cur_trans->transid = fs_info->generation;
 	fs_info->running_transaction = cur_trans;
-- 
cgit v1.2.3


From b297c9f68f4ac61c259dbe529ceb74910f25e281 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 13 Apr 2017 19:11:04 +0200
Subject: btrfs: remove unused member list from async_submit_bio

The list used to track checksums in the early version (2.6.29), but I
was not able to pinpoint the commit that stopped using it. Everything
apparently works without it for a long time.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d8ad30fdeee6..f896fa1dc070 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -121,7 +121,6 @@ struct async_submit_bio {
 	void *private_data;
 	struct btrfs_fs_info *fs_info;
 	struct bio *bio;
-	struct list_head list;
 	extent_submit_bio_hook_t *submit_bio_start;
 	extent_submit_bio_hook_t *submit_bio_done;
 	int mirror_num;
-- 
cgit v1.2.3


From c9fed2bb61c4050ff01c4337e880a546fde5677d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 13 Apr 2017 19:11:04 +0200
Subject: btrfs: remove unused member list from btrfs_end_io_wq

The end io work queue items have been tracked by the work queues since
"Btrfs: Add async worker threads for pre and post IO checksumming"
(8b7128429235d9bd72cfd5e) (2008).

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f896fa1dc070..bfeb5c12e3ff 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -89,7 +89,6 @@ struct btrfs_end_io_wq {
 	struct btrfs_fs_info *info;
 	int error;
 	enum btrfs_wq_endio_type metadata;
-	struct list_head list;
 	struct btrfs_work work;
 };
 
-- 
cgit v1.2.3


From 3189ff778630d9ce8a9f2fda9f8ae4510cceb154 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 25 May 2017 06:39:52 -0400
Subject: btrfs: btrfs_wait_tree_block_writeback can be void return

Nothing checks its return value.

Is it safe to skip checking return value of btrfs_wait_tree_block_writeback?

Liu Bo: I think yes, it's used in walk_log_tree which is called in two
places, free_log_tree and log replay.  For free_log_tree, it waits for
any running writeback of the extent buffer under freeing to finish in
case we need to access the eb pointer from page->private, and it's OK to
not check the return value, while for log replay, it doesn't wait
because wc->wait is not set. So neither cares about the writeback error.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
[ added more explanation to changelog, from Liu Bo ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 6 +++---
 fs/btrfs/disk-io.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bfeb5c12e3ff..cdc28ad7f1ed 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1223,10 +1223,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
 					buf->start + buf->len - 1);
 }
 
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-	return filemap_fdatawait_range(buf->pages[0]->mapping,
-				       buf->start, buf->start + buf->len - 1);
+	filemap_fdatawait_range(buf->pages[0]->mapping,
+			        buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 35ddfcf04ad0..4654d129aa76 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -127,7 +127,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			extent_submit_bio_hook_t *submit_bio_done);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From 4b5faeac4688174fd523f2a22b7d70d5a96842fb Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 28 Mar 2017 12:06:05 +0200
Subject: btrfs: use generic slab for btrfs_transaction

Observing the number of slab objects of btrfs_transaction, there's just
one active on an almost quiescent filesystem, and the number of objects
goes to about ten when sync is in progress. Then the number goes down to
1.  This matches the expectations of the transaction lifetime.

For such use the separate slab cache is not justified, as we do not
reuse objects frequently. For the shortlived transaction, the generic
slab (size 512) should be ok. We can optimistically expect that the 512
slabs are not all used (fragmentation) and there are free slots to take
when we do the allocation, compared to potentially allocating a whole new
page for the separate slab.

We'll lose the stats about the object use, which could be added later if
we really need them.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       | 1 -
 fs/btrfs/disk-io.c     | 5 -----
 fs/btrfs/inode.c       | 8 --------
 fs/btrfs/transaction.c | 8 ++++----
 4 files changed, 4 insertions(+), 18 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 388f1128544e..f0f5f28784b6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,6 @@ struct btrfs_trans_handle;
 struct btrfs_transaction;
 struct btrfs_pending_snapshot;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cdc28ad7f1ed..cb79bce3a972 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4624,11 +4624,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 
 	cur_trans->state =TRANS_STATE_COMPLETED;
 	wake_up(&cur_trans->commit_wait);
-
-	/*
-	memset(cur_trans, 0, sizeof(*cur_trans));
-	kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-	*/
 }
 
 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b8d34136044..0a1ec5cd3b8f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,7 +86,6 @@ static const struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
-struct kmem_cache *btrfs_transaction_cachep;
 struct kmem_cache *btrfs_path_cachep;
 struct kmem_cache *btrfs_free_space_cachep;
 
@@ -9513,7 +9512,6 @@ void btrfs_destroy_cachep(void)
 	rcu_barrier();
 	kmem_cache_destroy(btrfs_inode_cachep);
 	kmem_cache_destroy(btrfs_trans_handle_cachep);
-	kmem_cache_destroy(btrfs_transaction_cachep);
 	kmem_cache_destroy(btrfs_path_cachep);
 	kmem_cache_destroy(btrfs_free_space_cachep);
 }
@@ -9533,12 +9531,6 @@ int btrfs_init_cachep(void)
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
 
-	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
-			sizeof(struct btrfs_transaction), 0,
-			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
-	if (!btrfs_transaction_cachep)
-		goto fail;
-
 	btrfs_path_cachep = kmem_cache_create("btrfs_path",
 			sizeof(struct btrfs_path), 0,
 			SLAB_MEM_SPREAD, NULL);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ca0009ff47f1..ab030fb22530 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -93,7 +93,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			btrfs_put_block_group_trimming(cache);
 			btrfs_put_block_group(cache);
 		}
-		kmem_cache_free(btrfs_transaction_cachep, transaction);
+		kfree(transaction);
 	}
 }
 
@@ -228,7 +228,7 @@ loop:
 	 */
 	BUG_ON(type == TRANS_JOIN_NOLOCK);
 
-	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+	cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
 
@@ -238,11 +238,11 @@ loop:
 		 * someone started a transaction after we unlocked.  Make sure
 		 * to redo the checks above
 		 */
-		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+		kfree(cur_trans);
 		goto loop;
 	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		spin_unlock(&fs_info->trans_lock);
-		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+		kfree(cur_trans);
 		return -EROFS;
 	}
 
-- 
cgit v1.2.3


From e4f56903863c793bc1d181f7f0b6244ea5753338 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Fri, 2 Jun 2017 17:38:30 +0200
Subject: btrfs: btrfs_io_bio_alloc never fails, skip error handling

Update direct callers of btrfs_io_bio_alloc that do error handling, that
we can now remove.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c |  5 -----
 fs/btrfs/disk-io.c         |  3 ---
 fs/btrfs/extent_io.c       |  5 -----
 fs/btrfs/raid56.c          |  3 ---
 fs/btrfs/scrub.c           | 25 -------------------------
 5 files changed, 41 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5f8006e4de9d..160879c802d0 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1639,11 +1639,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		unsigned int j;
 
 		bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
-		if (!bio) {
-			pr_info("btrfsic: bio_alloc() for %u pages failed!\n",
-			       num_pages - i);
-			return -1;
-		}
 		bio->bi_bdev = block_ctx->dev->bdev;
 		bio->bi_iter.bi_sector = dev_bytenr >> 9;
 		bio_set_op_attrs(bio, REQ_OP_READ, 0);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cb79bce3a972..9f2ffe2c6afb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3533,9 +3533,6 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	 */
 	device->flush_bio = NULL;
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
-	if (!bio)
-		return -ENOMEM;
-
 	bio->bi_end_io = btrfs_end_empty_barrier;
 	bio->bi_bdev = device->bdev;
 	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e15bd02e534f..bfcbe8f2818b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1988,8 +1988,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	BUG_ON(!mirror_num);
 
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-	if (!bio)
-		return -EIO;
 	bio->bi_iter.bi_size = 0;
 	map_length = length;
 
@@ -2334,9 +2332,6 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
 	struct btrfs_io_bio *btrfs_bio;
 
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-	if (!bio)
-		return NULL;
-
 	bio->bi_end_io = endio_func;
 	bio->bi_iter.bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = fs_info->fs_devices->latest_bdev;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d68af3c61b49..7dd55448ac68 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1099,9 +1099,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 
 	/* put a new bio on the list */
 	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
-	if (!bio)
-		return -ENOMEM;
-
 	bio->bi_iter.bi_size = 0;
 	bio->bi_bdev = stripe->dev->bdev;
 	bio->bi_iter.bi_sector = disk_start >> 9;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 096e503e3ddc..1e2dfea00b2f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1738,11 +1738,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 		WARN_ON(!page->page);
 		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-		if (!bio) {
-			page->io_error = 1;
-			sblock->no_io_error_seen = 0;
-			continue;
-		}
 		bio->bi_bdev = page->dev->bdev;
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
@@ -1831,8 +1826,6 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		}
 
 		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-		if (!bio)
-			return -EIO;
 		bio->bi_bdev = page_bad->dev->bdev;
 		bio->bi_iter.bi_sector = page_bad->physical >> 9;
 		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -1924,10 +1917,6 @@ again:
 		if (!bio) {
 			bio = btrfs_io_bio_alloc(GFP_KERNEL,
 					sctx->pages_per_wr_bio);
-			if (!bio) {
-				mutex_unlock(&sctx->wr_lock);
-				return -ENOMEM;
-			}
 			sbio->bio = bio;
 		}
 
@@ -2329,8 +2318,6 @@ again:
 		if (!bio) {
 			bio = btrfs_io_bio_alloc(GFP_KERNEL,
 					sctx->pages_per_rd_bio);
-			if (!bio)
-				return -ENOMEM;
 			sbio->bio = bio;
 		}
 
@@ -2457,9 +2444,6 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 	}
 
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
-	if (!bio)
-		goto bbio_out;
-
 	bio->bi_iter.bi_sector = logical >> 9;
 	bio->bi_private = sblock;
 	bio->bi_end_io = scrub_missing_raid56_end_io;
@@ -3036,9 +3020,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 		goto bbio_out;
 
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
-	if (!bio)
-		goto bbio_out;
-
 	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
 	bio->bi_private = sparity;
 	bio->bi_end_io = scrub_parity_bio_endio;
@@ -4646,12 +4627,6 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 		return -EIO;
 	}
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-	if (!bio) {
-		spin_lock(&sctx->stat_lock);
-		sctx->stat.malloc_errors++;
-		spin_unlock(&sctx->stat_lock);
-		return -ENOMEM;
-	}
 	bio->bi_iter.bi_size = 0;
 	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
 	bio->bi_bdev = dev->bdev;
-- 
cgit v1.2.3


From c5e4c3d7503453832444475641988ffa02b88b6d Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Mon, 12 Jun 2017 17:29:41 +0200
Subject: btrfs: sink gfp parameter to btrfs_io_bio_alloc

We can hardcode GFP_NOFS to btrfs_io_bio_alloc, although it means we
change it back from GFP_KERNEL in scrub. I'd rather save a few stack
bytes from not passing the gfp flags in the remaining, more important,
contexts and the bio allocating API now looks more consistent.

Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c |  2 +-
 fs/btrfs/disk-io.c         |  2 +-
 fs/btrfs/extent_io.c       |  8 ++++----
 fs/btrfs/extent_io.h       |  2 +-
 fs/btrfs/raid56.c          |  2 +-
 fs/btrfs/scrub.c           | 16 +++++++---------
 6 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 160879c802d0..e3b1d08dd03c 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1638,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		struct bio *bio;
 		unsigned int j;
 
-		bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
+		bio = btrfs_io_bio_alloc(num_pages - i);
 		bio->bi_bdev = block_ctx->dev->bdev;
 		bio->bi_iter.bi_sector = dev_bytenr >> 9;
 		bio_set_op_attrs(bio, REQ_OP_READ, 0);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9f2ffe2c6afb..8b57c280e5cd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3532,7 +3532,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	 * caller
 	 */
 	device->flush_bio = NULL;
-	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+	bio = btrfs_io_bio_alloc(0);
 	bio->bi_end_io = btrfs_end_empty_barrier;
 	bio->bi_bdev = device->bdev;
 	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cbd0a9a1daa5..29a6111a68d2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1987,7 +1987,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
 	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
 	BUG_ON(!mirror_num);
 
-	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(1);
 	bio->bi_iter.bi_size = 0;
 	map_length = length;
 
@@ -2331,7 +2331,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
 	struct btrfs_io_bio *btrfs_failed_bio;
 	struct btrfs_io_bio *btrfs_bio;
 
-	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(1);
 	bio->bi_end_io = endio_func;
 	bio->bi_iter.bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = fs_info->fs_devices->latest_bdev;
@@ -2692,12 +2692,12 @@ struct bio *btrfs_bio_clone(struct bio *bio)
 	return new;
 }
 
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
 {
 	struct bio *bio;
 
 	/* Bio allocation backed by a bioset does not fail */
-	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+	bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
 	btrfs_io_bio_init(btrfs_io_bio(bio));
 	return bio;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8071e3977614..1e508a8f876e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -463,7 +463,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 				 unsigned bits_to_clear,
 				 unsigned long page_ops);
 struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
 struct bio *btrfs_bio_clone(struct bio *bio);
 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
 
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 7dd55448ac68..b9abb0b01021 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1098,7 +1098,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 	}
 
 	/* put a new bio on the list */
-	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
+	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
 	bio->bi_iter.bi_size = 0;
 	bio->bi_bdev = stripe->dev->bdev;
 	bio->bi_iter.bi_sector = disk_start >> 9;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1e2dfea00b2f..58a249cd5adc 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1737,7 +1737,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		}
 
 		WARN_ON(!page->page);
-		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+		bio = btrfs_io_bio_alloc(1);
 		bio->bi_bdev = page->dev->bdev;
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
@@ -1825,7 +1825,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 			return -EIO;
 		}
 
-		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+		bio = btrfs_io_bio_alloc(1);
 		bio->bi_bdev = page_bad->dev->bdev;
 		bio->bi_iter.bi_sector = page_bad->physical >> 9;
 		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -1915,8 +1915,7 @@ again:
 		sbio->dev = sctx->wr_tgtdev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = btrfs_io_bio_alloc(GFP_KERNEL,
-					sctx->pages_per_wr_bio);
+			bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
 			sbio->bio = bio;
 		}
 
@@ -2316,8 +2315,7 @@ again:
 		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = btrfs_io_bio_alloc(GFP_KERNEL,
-					sctx->pages_per_rd_bio);
+			bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
 			sbio->bio = bio;
 		}
 
@@ -2443,7 +2441,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 		goto bbio_out;
 	}
 
-	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+	bio = btrfs_io_bio_alloc(0);
 	bio->bi_iter.bi_sector = logical >> 9;
 	bio->bi_private = sblock;
 	bio->bi_end_io = scrub_missing_raid56_end_io;
@@ -3019,7 +3017,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 	if (ret || !bbio || !bbio->raid_map)
 		goto bbio_out;
 
-	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+	bio = btrfs_io_bio_alloc(0);
 	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
 	bio->bi_private = sparity;
 	bio->bi_end_io = scrub_parity_bio_endio;
@@ -4626,7 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 			"scrub write_page_nocow(bdev == NULL) is unexpected");
 		return -EIO;
 	}
-	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	bio = btrfs_io_bio_alloc(1);
 	bio->bi_iter.bi_size = 0;
 	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
 	bio->bi_bdev = dev->bdev;
-- 
cgit v1.2.3


From 12b9bf0b942ecca695fe709ea754091918d88b01 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 13 Jun 2017 17:32:29 +0800
Subject: btrfs: write_dev_flush does not return ENOMEM anymore

Since commit "btrfs: btrfs_io_bio_alloc never fails, skip error handling"
write_dev_flush will not return ENOMEM in the sending part. We do not
need to check for it in the callers.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ updated changelog ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 38 +++++---------------------------------
 1 file changed, 5 insertions(+), 33 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8b57c280e5cd..1e90469cc0d2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3505,13 +3505,6 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 	if (wait) {
 		bio = device->flush_bio;
-		if (!bio)
-			/*
-			 * This means the alloc has failed with ENOMEM, however
-			 * here we return 0, as its not a device error.
-			 */
-			return 0;
-
 		wait_for_completion(&device->flush_wait);
 
 		if (bio->bi_error) {
@@ -3548,25 +3541,16 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
 {
-	int submit_flush_error = 0;
 	int dev_flush_error = 0;
 	struct btrfs_device *dev;
-	int tolerance;
 
 	list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) {
-		if (!dev->bdev) {
-			submit_flush_error++;
-			dev_flush_error++;
-			continue;
-		}
-		if (dev->last_flush_error == -ENOMEM)
-			submit_flush_error++;
-		if (dev->last_flush_error && dev->last_flush_error != -ENOMEM)
+		if (!dev->bdev || dev->last_flush_error)
 			dev_flush_error++;
 	}
 
-	tolerance = fsdevs->fs_info->num_tolerated_disk_barrier_failures;
-	if (submit_flush_error > tolerance || dev_flush_error > tolerance)
+	if (dev_flush_error >
+	    fsdevs->fs_info->num_tolerated_disk_barrier_failures)
 		return -EIO;
 
 	return 0;
@@ -3596,10 +3580,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		ret = write_dev_flush(dev, 0);
-		if (ret)
-			errors_send++;
-		dev->last_flush_error = ret;
+		write_dev_flush(dev, 0);
+		dev->last_flush_error = 0;
 	}
 
 	/* wait for all the barriers */
@@ -3620,16 +3602,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		}
 	}
 
-	/*
-	 * Try hard in case of flush. Lets say, in RAID1 we have
-	 * the following situation
-	 *  dev1: EIO dev2: ENOMEM
-	 * this is not a fatal error as we hope to recover from
-	 * ENOMEM in the next attempt to flush.
-	 * But the following is considered as fatal
-	 *  dev1: ENOMEM dev2: ENOMEM
-	 *  dev1: bdev == NULL dev2: ENOMEM
-	 */
 	if (errors_send || errors_wait) {
 		/*
 		 * At some point we need the status of all disks
-- 
cgit v1.2.3


From cea7c8bf77209ba11350e9c6f541064c820a174c Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 13 Jun 2017 17:05:40 +0800
Subject: btrfs: remove redundant null bdev counting during flush submission

There is no extra benefit in counting null bdevs during the submit loop,
as these null devices will be checked anyway in the command completion
device loop just after the submit loop. Since we are holding the
device_list_mutex, the device->bdev status won't change in between.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e90469cc0d2..528eef955060 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3564,7 +3564,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
-	int errors_send = 0;
 	int errors_wait = 0;
 	int ret;
 
@@ -3573,10 +3572,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (dev->missing)
 			continue;
-		if (!dev->bdev) {
-			errors_send++;
+		if (!dev->bdev)
 			continue;
-		}
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
@@ -3602,7 +3599,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		}
 	}
 
-	if (errors_send || errors_wait) {
+	if (errors_wait) {
 		/*
 		 * At some point we need the status of all disks
 		 * to arrive at the volume status. So error checking
-- 
cgit v1.2.3


From 4fc6441aac75893b4f32415f693d001f290d7d5b Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 13 Jun 2017 17:05:41 +0800
Subject: btrfs: wait part of the write_dev_flush() can be separated out

Submit and wait parts of write_dev_flush() can be split into two
separate functions for better readability.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 59 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 528eef955060..2b00ebff13f8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3488,37 +3488,16 @@ static void btrfs_end_empty_barrier(struct bio *bio)
 }
 
 /*
- * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
- * sent down.  With wait == 1, it waits for the previous flush.
- *
- * any device where the flush fails with eopnotsupp are flagged as not-barrier
- * capable
+ * Submit a flush request to the device if it supports it. Error handling is
+ * done in the waiting counterpart.
  */
-static int write_dev_flush(struct btrfs_device *device, int wait)
+static void write_dev_flush(struct btrfs_device *device)
 {
 	struct request_queue *q = bdev_get_queue(device->bdev);
 	struct bio *bio;
-	int ret = 0;
 
 	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
-		return 0;
-
-	if (wait) {
-		bio = device->flush_bio;
-		wait_for_completion(&device->flush_wait);
-
-		if (bio->bi_error) {
-			ret = bio->bi_error;
-			btrfs_dev_stat_inc_and_print(device,
-				BTRFS_DEV_STAT_FLUSH_ERRS);
-		}
-
-		/* drop the reference from the wait == 0 run */
-		bio_put(bio);
-		device->flush_bio = NULL;
-
-		return ret;
-	}
+		return;
 
 	/*
 	 * one reference for us, and we leave it for the
@@ -3535,8 +3514,32 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 	bio_get(bio);
 	btrfsic_submit_bio(bio);
+}
 
-	return 0;
+/*
+ * If the flush bio has been submitted by write_dev_flush, wait for it.
+ */
+static int wait_dev_flush(struct btrfs_device *device)
+{
+	int ret = 0;
+	struct bio *bio = device->flush_bio;
+
+	if (!bio)
+		return 0;
+
+	wait_for_completion(&device->flush_wait);
+
+	if (bio->bi_error) {
+		ret = bio->bi_error;
+		btrfs_dev_stat_inc_and_print(device,
+				BTRFS_DEV_STAT_FLUSH_ERRS);
+	}
+
+	/* drop the reference from the wait == 0 run */
+	bio_put(bio);
+	device->flush_bio = NULL;
+
+	return ret;
 }
 
 static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
@@ -3577,7 +3580,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		write_dev_flush(dev, 0);
+		write_dev_flush(dev);
 		dev->last_flush_error = 0;
 	}
 
@@ -3592,7 +3595,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		ret = write_dev_flush(dev, 1);
+		ret = wait_dev_flush(dev);
 		if (ret) {
 			dev->last_flush_error = ret;
 			errors_wait++;
-- 
cgit v1.2.3


From fac03c8daeb581e2bc38e5a8c0c6a42cf87cf1c3 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 15 Jun 2017 19:10:03 +0200
Subject: btrfs: move fs_info::fs_frozen to the flags

We can keep the state among the other fs_info flags, there's no reason
why fs_frozen would need to be separate.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h       | 5 ++---
 fs/btrfs/disk-io.c     | 1 -
 fs/btrfs/super.c       | 6 ++++--
 fs/btrfs/transaction.c | 3 ++-
 4 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f0f5f28784b6..6375e57a5a69 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -716,6 +716,8 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_LOG1_ERR			12
 #define BTRFS_FS_LOG2_ERR			13
 #define BTRFS_FS_QUOTA_OVERRIDE			14
+/* Used to record internally whether fs has been frozen */
+#define BTRFS_FS_FROZEN				15
 
 /*
  * Indicate that a whole-filesystem exclusive operation is running
@@ -1107,9 +1109,6 @@ struct btrfs_fs_info {
 	 */
 	struct list_head pinned_chunks;
 
-	/* Used to record internally whether fs has been frozen */
-	int fs_frozen;
-
 	/* Cached block sizes */
 	u32 nodesize;
 	u32 sectorsize;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b00ebff13f8..2ac0a35f4450 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2661,7 +2661,6 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->qgroup_op_seq, 0);
 	atomic_set(&fs_info->reada_works_cnt, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
-	fs_info->fs_frozen = 0;
 	fs_info->sb = sb;
 	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
 	fs_info->metadata_ratio = 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 53d43cd3cace..2100be6ae68e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2242,7 +2242,7 @@ static int btrfs_freeze(struct super_block *sb)
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *root = fs_info->tree_root;
 
-	fs_info->fs_frozen = 1;
+	set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
 	/*
 	 * We don't need a barrier here, we'll wait for any transaction that
 	 * could be in progress on other threads (and do delayed iputs that
@@ -2261,7 +2261,9 @@ static int btrfs_freeze(struct super_block *sb)
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-	btrfs_sb(sb)->fs_frozen = 0;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
 	return 0;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ab030fb22530..97e33513b195 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2314,7 +2314,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 	 * it'll result in deadlock about SB_FREEZE_FS.
 	 */
 	if (current != fs_info->transaction_kthread &&
-	    current != fs_info->cleaner_kthread && !fs_info->fs_frozen)
+	    current != fs_info->cleaner_kthread &&
+	    !test_bit(BTRFS_FS_FROZEN, &fs_info->flags))
 		btrfs_run_delayed_iputs(fs_info);
 
 	return ret;
-- 
cgit v1.2.3


From e0ae999414238aa9c0a116844813982effb68a02 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 6 Jun 2017 17:06:06 +0200
Subject: btrfs: preallocate device flush bio

For devices that support flushing, we allocate a bio, submit, wait for
it and then free it. The bio allocation does not fail so ENOMEM is not a
problem but we still may unnecessarily stress the allocation subsystem.

Instead, we can allocate the bio at the same time we allocate the device
and reuse it each time we need to flush the barriers. The bio is reset
before each use. Reference counting is simplified to just device
allocation (get) and freeing (put).

The bio used to be submitted through the integrity checker, which would
find out that the bio has no data attached and just call submit_bio, so
we now call submit_bio directly.

Status of the bio in flight needs to be tracked separately in case the
device caches get switched off between write and wait.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 25 +++++++------------------
 fs/btrfs/volumes.c | 12 ++++++++++++
 fs/btrfs/volumes.h |  1 +
 3 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ac0a35f4450..bfcbab3a7607 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3481,9 +3481,7 @@ static int write_dev_supers(struct btrfs_device *device,
  */
 static void btrfs_end_empty_barrier(struct bio *bio)
 {
-	if (bio->bi_private)
-		complete(bio->bi_private);
-	bio_put(bio);
+	complete(bio->bi_private);
 }
 
 /*
@@ -3493,26 +3491,20 @@ static void btrfs_end_empty_barrier(struct bio *bio)
 static void write_dev_flush(struct btrfs_device *device)
 {
 	struct request_queue *q = bdev_get_queue(device->bdev);
-	struct bio *bio;
+	struct bio *bio = device->flush_bio;
 
 	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 		return;
 
-	/*
-	 * one reference for us, and we leave it for the
-	 * caller
-	 */
-	device->flush_bio = NULL;
-	bio = btrfs_io_bio_alloc(0);
+	bio_reset(bio);
 	bio->bi_end_io = btrfs_end_empty_barrier;
 	bio->bi_bdev = device->bdev;
 	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
 	init_completion(&device->flush_wait);
 	bio->bi_private = &device->flush_wait;
-	device->flush_bio = bio;
 
-	bio_get(bio);
-	btrfsic_submit_bio(bio);
+	submit_bio(bio);
+	device->flush_bio_sent = 1;
 }
 
 /*
@@ -3523,9 +3515,10 @@ static int wait_dev_flush(struct btrfs_device *device)
 	int ret = 0;
 	struct bio *bio = device->flush_bio;
 
-	if (!bio)
+	if (!device->flush_bio_sent)
 		return 0;
 
+	device->flush_bio_sent = 0;
 	wait_for_completion(&device->flush_wait);
 
 	if (bio->bi_error) {
@@ -3534,10 +3527,6 @@ static int wait_dev_flush(struct btrfs_device *device)
 				BTRFS_DEV_STAT_FLUSH_ERRS);
 	}
 
-	/* drop the reference from the wait == 0 run */
-	bio_put(bio);
-	device->flush_bio = NULL;
-
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2090245e8f06..c95f018d4a1e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -242,6 +242,17 @@ static struct btrfs_device *__alloc_device(void)
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
+	/*
+	 * Preallocate a bio that's always going to be used for flushing device
+	 * barriers and matches the device lifespan
+	 */
+	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
+	if (!dev->flush_bio) {
+		kfree(dev);
+		return ERR_PTR(-ENOMEM);
+	}
+	bio_get(dev->flush_bio);
+
 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
 	INIT_LIST_HEAD(&dev->resized_list);
@@ -838,6 +849,7 @@ static void __free_device(struct work_struct *work)
 
 	device = container_of(work, struct btrfs_device, rcu_work);
 	rcu_string_free(device->name);
+	bio_put(device->flush_bio);
 	kfree(device);
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 35327efecdbb..6f45fd60d15a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -75,6 +75,7 @@ struct btrfs_device {
 	int can_discard;
 	int is_tgtdev_for_dev_replace;
 	int last_flush_error;
+	int flush_bio_sent;
 
 #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
 	seqcount_t data_seqcount;
-- 
cgit v1.2.3


From 2980d5745fa0beeaab4c22e25bea4faa54f7d9f7 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 15 Jun 2017 16:04:26 +0200
Subject: btrfs: account as waiting for IO, while waiting for the flush bio
 completion

Similar to what submit_bio_wait does, we should account for IO while
waiting for a bio completion. This has marginal visible effects, as the
flush bio is short-lived.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bfcbab3a7607..917e29167580 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3519,7 +3519,7 @@ static int wait_dev_flush(struct btrfs_device *device)
 		return 0;
 
 	device->flush_bio_sent = 0;
-	wait_for_completion(&device->flush_wait);
+	wait_for_completion_io(&device->flush_wait);
 
 	if (bio->bi_error) {
 		ret = bio->bi_error;
-- 
cgit v1.2.3


From 66b4993e95c17e0ee30f0d72f0d6e00e5c035a98 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 15 Jun 2017 16:20:43 +0200
Subject: btrfs: move dev stats accounting out of wait_dev_flush

We should really just wait in wait_dev_flush and let the caller decide
what to do with the error value.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 917e29167580..b6758892874f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3512,7 +3512,6 @@ static void write_dev_flush(struct btrfs_device *device)
  */
 static int wait_dev_flush(struct btrfs_device *device)
 {
-	int ret = 0;
 	struct bio *bio = device->flush_bio;
 
 	if (!device->flush_bio_sent)
@@ -3521,13 +3520,7 @@ static int wait_dev_flush(struct btrfs_device *device)
 	device->flush_bio_sent = 0;
 	wait_for_completion_io(&device->flush_wait);
 
-	if (bio->bi_error) {
-		ret = bio->bi_error;
-		btrfs_dev_stat_inc_and_print(device,
-				BTRFS_DEV_STAT_FLUSH_ERRS);
-	}
-
-	return ret;
+	return bio->bi_error;
 }
 
 static int check_barrier_error(struct btrfs_fs_devices *fsdevs)
@@ -3586,6 +3579,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 		ret = wait_dev_flush(dev);
 		if (ret) {
 			dev->last_flush_error = ret;
+			btrfs_dev_stat_inc_and_print(dev,
+					BTRFS_DEV_STAT_FLUSH_ERRS);
 			errors_wait++;
 		}
 	}
-- 
cgit v1.2.3