mm: pagecache add lock

Add a per address space lock around adding pages to the pagecache - making it possible for fallocate INSERT_RANGE/COLLAPSE_RANGE to work correctly, and also hopefully making truncate and dio a bit saner.
author: Kent Overstreet <kent.overstreet@gmail.com> 2016-03-28 19:38:20 -0800
committer: Kent Overstreet <kent.overstreet@gmail.com> 2020-05-06 17:14:14 -0400
commit: 2040ab403ba11900f19596fa16427624e5b8721d (patch)
tree: e7990c2237a33cbe812039ee3c7ed67d57e4fa01
parent: b3785d8f7feca002a2ebaf287acecb27c34fc61a (diff)
5 files changed, 119 insertions, 10 deletions
diff --git a/fs/inode.c b/fs/inode.c
index c9eb5041ffae..2c7d26015b3b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -351,6 +351,7 @@ EXPORT_SYMBOL(inc_nlink);
 static void __address_space_init_once(struct address_space *mapping)
 {
 	INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT);
+	pagecache_lock_init(&mapping->add_lock);
 	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 15b8e02880c3..da0ca2d972d9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -412,6 +412,28 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like read side of rwsem, but conflict with other state:
+ */
+struct pagecache_lock {
+	atomic_long_t		v;
+	wait_queue_head_t	wait;
+};
+
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{
+	atomic_long_set(&lock->v, 0);
+	init_waitqueue_head(&lock->wait);
+}
+
+void pagecache_add_put(struct pagecache_lock *);
+void pagecache_add_get(struct pagecache_lock *);
+void __pagecache_block_put(struct pagecache_lock *);
+void __pagecache_block_get(struct pagecache_lock *);
+void pagecache_block_put(struct pagecache_lock *);
+void pagecache_block_get(struct pagecache_lock *);
+
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	i_pages;	/* cached pages */
@@ -430,6 +452,8 @@ struct address_space {
 	struct list_head	private_list;	/* for use by the address_space */
 	void			*private_data;	/* ditto */
 	errseq_t		wb_err;
+	struct pagecache_lock	add_lock
+		____cacheline_aligned_in_smp;	/* protects adding new pages */
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c69f308f3a53..80629ad3d05a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -41,6 +41,7 @@ struct io_context;
 struct mempolicy;
 struct nameidata;
 struct nsproxy;
+struct pagecache_lock;
 struct perf_event_context;
 struct pid_namespace;
 struct pipe_inode_info;
@@ -940,6 +941,9 @@ struct task_struct {
 	unsigned int			in_ubsan;
 #endif
 
+	/* currently held lock, for avoiding recursing in fault path: */
+	struct pagecache_lock *pagecache_lock;
+
 	/* Journalling filesystem info: */
 	void				*journal_info;
 
diff --git a/init/init_task.c b/init/init_task.c
index 5aebe3be4d7c..f4be590da394 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -114,6 +114,7 @@ struct task_struct init_task
 	},
 	.blocked	= {{0}},
 	.alloc_lock	= __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
+	.pagecache_lock = NULL,
 	.journal_info	= NULL,
 	INIT_CPU_TIMERS(init_task)
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
diff --git a/mm/filemap.c b/mm/filemap.c
index 45f1c6d73b5b..b147e55a8d67 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -111,6 +111,73 @@
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
+{
+	BUG_ON(atomic_long_read(&lock->v) == 0);
+
+	if (atomic_long_sub_return_release(i, &lock->v) == 0)
+		wake_up_all(&lock->wait);
+}
+
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
+{
+	long v = atomic_long_read(&lock->v), old;
+
+	do {
+		old = v;
+
+		if (i > 0 ? v < 0 : v > 0)
+			return false;
+	} while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+					old, old + i)) != old);
+	return true;
+}
+
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
+{
+	wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
+}
+
+void pagecache_add_put(struct pagecache_lock *lock)
+{
+	__pagecache_lock_put(lock, 1);
+}
+EXPORT_SYMBOL(pagecache_add_put);
+
+void pagecache_add_get(struct pagecache_lock *lock)
+{
+	__pagecache_lock_get(lock, 1);
+}
+EXPORT_SYMBOL(pagecache_add_get);
+
+void __pagecache_block_put(struct pagecache_lock *lock)
+{
+	__pagecache_lock_put(lock, -1);
+}
+EXPORT_SYMBOL(__pagecache_block_put);
+
+void __pagecache_block_get(struct pagecache_lock *lock)
+{
+	__pagecache_lock_get(lock, -1);
+}
+EXPORT_SYMBOL(__pagecache_block_get);
+
+void pagecache_block_put(struct pagecache_lock *lock)
+{
+	BUG_ON(current->pagecache_lock != lock);
+	current->pagecache_lock = NULL;
+	__pagecache_lock_put(lock, -1);
+}
+EXPORT_SYMBOL(pagecache_block_put);
+
+void pagecache_block_get(struct pagecache_lock *lock)
+{
+	__pagecache_lock_get(lock, -1);
+	BUG_ON(current->pagecache_lock);
+	current->pagecache_lock = lock;
+}
+EXPORT_SYMBOL(pagecache_block_get);
+
 static int page_cache_tree_insert(struct address_space *mapping,
 				  struct page *page, void **shadowp)
 {
@@ -858,19 +925,19 @@ static int __add_to_page_cache_locked(struct page *page,
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
 
+	if (current->pagecache_lock != &mapping->add_lock)
+		pagecache_add_get(&mapping->add_lock);
+
 	if (!huge) {
 		error = mem_cgroup_try_charge(page, current->mm,
 					      gfp_mask, &memcg, false);
 		if (error)
-			return error;
+			goto err;
 	}
 
 	error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
-	if (error) {
-		if (!huge)
-			mem_cgroup_cancel_charge(page, memcg, false);
-		return error;
-	}
+	if (error)
+		goto err_uncharge;
 
 	get_page(page);
 	page->mapping = mapping;
@@ -889,15 +956,20 @@ static int __add_to_page_cache_locked(struct page *page,
 	if (!huge)
 		mem_cgroup_commit_charge(page, memcg, false, false);
 	trace_mm_filemap_add_to_page_cache(page);
-	return 0;
+err:
+	if (current->pagecache_lock != &mapping->add_lock)
+		pagecache_add_put(&mapping->add_lock);
+
+	return error;
 err_insert:
 	page->mapping = NULL;
 	/* Leave page->index set: truncation relies upon it */
 	xa_unlock_irq(&mapping->i_pages);
+	put_page(page);
+err_uncharge:
 	if (!huge)
 		mem_cgroup_cancel_charge(page, memcg, false);
-	put_page(page);
-	return error;
+	goto err;
 }
 
 /**
@@ -2532,7 +2604,14 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
-	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
+	if (unlikely(current->pagecache_lock == &mapping->add_lock)) {
+		/*
+		 * fault from e.g. dio -> get_user_pages() - _don't_ want to do
+		 * readahead, only read in page we need:
+		 */
+		if (!page)
+			goto no_cached_page;
+	} else if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
author	Kent Overstreet <kent.overstreet@gmail.com>	2016-03-28 19:38:20 -0800
committer	Kent Overstreet <kent.overstreet@gmail.com>	2020-05-06 17:14:14 -0400
commit	2040ab403ba11900f19596fa16427624e5b8721d (patch)
tree	e7990c2237a33cbe812039ee3c7ed67d57e4fa01
parent	b3785d8f7feca002a2ebaf287acecb27c34fc61a (diff)