diff options
author: Darrick J. Wong <djwong@kernel.org> | 2021-10-05 10:47:36 -0700
committer: Darrick J. Wong <djwong@kernel.org> | 2021-10-22 16:41:17 -0700
commit | 04e01af9318f50df5e1974d8e2ce7d54b6cfbe55 (patch) | |
tree | c9ecd431013084675617dda8097c2a07ca3f3156 | |
parent | 058725b83f3ab866a225aec3bfa204c0b7f4d3a8 (diff) |
xfs: map xfile pages directly into xfs_buf
Map the xfile pages directly into xfs_buf to reduce memory overhead.
It's silly to use memory to stage changes to shmem pages for ephemeral
btrees that don't care about transactionality.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
-rw-r--r-- | fs/xfs/libxfs/xfs_btree_mem.h | 2 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_rmap_btree.c | 4 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_rtrmap_btree.c | 2 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.h | 2 | ||||
-rw-r--r-- | fs/xfs/scrub/xfbtree.c | 11 | ||||
-rw-r--r-- | fs/xfs/scrub/xfile.c | 111 | ||||
-rw-r--r-- | fs/xfs/scrub/xfile.h | 22 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.c | 170 | ||||
-rw-r--r-- | fs/xfs/xfs_buf.h | 5 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_buf.c | 7 |
10 files changed, 327 insertions, 9 deletions
diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h index ddeb05ddba4a..91cf23ee1c44 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.h +++ b/fs/xfs/libxfs/xfs_btree_mem.h @@ -88,5 +88,7 @@ xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp) /* btree has long pointers */ #define XFBTREE_CREATE_LONG_PTRS (1U << 0) +/* buffers should be directly mapped from memory */ +#define XFBTREE_DIRECT_MAP (1U << 1) #endif /* __XFS_BTREE_MEM_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 0465da82e11f..d575295fbff9 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -551,8 +551,8 @@ xfs_rmapbt_mem_create( struct xfs_mount *mp, const char *name) { - return xfbtree_create(mp, XFS_BTNUM_RMAP, &xfs_rmapbt_buf_ops, 0, - name); + return xfbtree_create(mp, XFS_BTNUM_RMAP, &xfs_rmapbt_buf_ops, + XFBTREE_DIRECT_MAP, name); } /* diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 29de9bfbde20..6bbabf12b0c9 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -566,7 +566,7 @@ xfs_rtrmapbt_mem_create( const char *name) { return xfbtree_create(mp, XFS_BTNUM_RTRMAP, &xfs_rtrmapbt_buf_ops, - XFBTREE_CREATE_LONG_PTRS, name); + XFBTREE_CREATE_LONG_PTRS | XFBTREE_DIRECT_MAP, name); } /* diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 25fa1eda4162..1342dd3653fc 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -921,6 +921,8 @@ DEFINE_XFILE_EVENT(xfile_pwrite); DEFINE_XFILE_EVENT(xfile_seek_data); DEFINE_XFILE_EVENT(xfile_discard); DEFINE_XFILE_EVENT(xfile_prealloc); +DEFINE_XFILE_EVENT(xfile_obj_get_page); +DEFINE_XFILE_EVENT(xfile_obj_put_page); TRACE_EVENT(xfarray_sort_stats, TP_PROTO(struct xfarray *xfa, unsigned int max_stack_depth, diff --git a/fs/xfs/scrub/xfbtree.c b/fs/xfs/scrub/xfbtree.c index c43c18136733..1098234eec3b 100644 --- a/fs/xfs/scrub/xfbtree.c +++ b/fs/xfs/scrub/xfbtree.c @@ 
-151,8 +151,9 @@ xfs_btree_mem_head_read_buf( { struct xfs_mount *mp = btp->bt_mount; - return xfs_trans_read_buf(mp, tp, btp, XFS_BTREE_MEM_HEAD_DADDR, 1, 0, - bpp, &xfs_btree_mem_head_buf_ops); + return xfs_trans_read_buf(mp, tp, btp, XFS_BTREE_MEM_HEAD_DADDR, + XFS_FSB_TO_BB(mp, 1), 0, bpp, + &xfs_btree_mem_head_buf_ops); } /* Return tree height from the in-memory btree head */ @@ -239,6 +240,9 @@ xfbtree_create( goto err_xfile; } + if (mp->m_bsize == PAGE_SIZE && (flags & XFBTREE_DIRECT_MAP)) + xfbt->target->bt_flags |= XFS_BUFTARG_DIRECT_MAP; + xfbt->freespace = kmem_alloc(sizeof(struct xbitmap), KM_NOFS | KM_MAYFAIL); if (!xfbt->freespace) { @@ -264,7 +268,8 @@ xfbtree_create( goto err_freesp; /* Initialize the in-memory btree header block. */ - error = xfs_buf_get(xfbt->target, XFS_BTREE_MEM_HEAD_DADDR, 1, &bp); + error = xfs_buf_get(xfbt->target, XFS_BTREE_MEM_HEAD_DADDR, + XFS_FSB_TO_BB(mp, 1), &bp); if (error) goto err_freesp; diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 310e96f3f1d4..764f09984c06 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -496,3 +496,114 @@ next_pgoff: out: return error; } + +/* + * Grab the (locked) page for a memory object. The object cannot span a page + * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we + * cannot grab the page, or the usual negative errno. 
+ */ +int +xfile_obj_get_page( + struct xfile *xf, + loff_t pos, + unsigned int len, + struct page **pagep) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + struct page *page = NULL; + void *fsdata = NULL; + unsigned int pflags; + int error; + + if (inode->i_sb->s_maxbytes - pos < len) + return -ENOMEM; + if (len > PAGE_SIZE - offset_in_page(pos)) + return -ENOTBLK; + + trace_xfile_obj_get_page(xf, pos, len); + + pflags = memalloc_nofs_save(); + + /* + * We call pagecache_write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. shmem + * doesn't support fs freeze, but lockdep doesn't know that and will + * trip over that. + */ + error = pagecache_write_begin(NULL, mapping, pos, len, AOP_FLAG_NOFS, + &page, &fsdata); + if (error) + goto out_pflags; + + /* + * We don't support passing fsdata to the caller and back to + * xfile_put_page, so if we get a non-null pointer we just bail out. + */ + if (fsdata != NULL || PageHWPoison(page)) { + int ret; + + ASSERT(fsdata != NULL); + ret = pagecache_write_end(NULL, mapping, pos, len, 0, page, + fsdata); + if (ret < 0) + error = ret; + else + error = -ENOTBLK; + goto out_pflags; + } + + /* We got the page, so make sure we push out EOF. */ + if (i_size_read(inode) < pos + len) + i_size_write(inode, pos + len); + + /* + * If the page isn't up to date, fill it with zeroes before we hand it + * to the caller and make sure the backing store will hold on to them. + */ + if (!PageUptodate(page)) { + void *kaddr; + + kaddr = kmap_local_page(page); + memset(kaddr, 0, PAGE_SIZE); + kunmap_local(kaddr); + SetPageUptodate(page); + set_page_dirty(page); + } + + *pagep = page; +out_pflags: + memalloc_nofs_restore(pflags); + return error; +} + +/* + * Release the (locked) page for a memory object. The page must have been + * obtained by xfile_obj_get_page. Returns 0 or a negative errno. 
+ */ +int +xfile_obj_put_page( + struct xfile *xf, + loff_t pos, + unsigned int len, + struct page *page) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + unsigned int pflags; + int ret; + + ASSERT(len <= PAGE_SIZE - offset_in_page(pos)); + + trace_xfile_obj_put_page(xf, pos, len); + + pflags = memalloc_nofs_save(); + ret = pagecache_write_end(NULL, mapping, pos, len, len, page, NULL); + memalloc_nofs_restore(pflags); + + if (ret < 0) + return ret; + if (ret != len) + return -EIO; + return 0; +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index c6f6f56b4d0f..e63d61a380fe 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -57,6 +57,10 @@ struct xfile_stat { int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); int xfile_dump(struct xfile *xf); +int xfile_obj_get_page(struct xfile *xf, loff_t offset, unsigned int len, + struct page **pagep); +int xfile_obj_put_page(struct xfile *xf, loff_t offset, unsigned int len, + struct page *page); #else static inline int xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t offset) @@ -69,6 +73,24 @@ xfile_obj_store(struct xfile *xf, void *buf, size_t count, loff_t offset) { return -EIO; } +static inline int +xfile_obj_get_page( + struct xfile *xf, + loff_t offset, + unsigned int len, + struct page **pagep) +{ + return -EIO; +} +static inline int +xfile_obj_put_page( + struct xfile *xf, + loff_t offset, + unsigned int len, + struct page *page) +{ + return -EIO; +} #endif /* CONFIG_XFS_ONLINE_SCRUB */ #endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b8114a4d9761..5d1eebee2555 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -271,6 +271,45 @@ _xfs_buf_alloc( } static void +xfs_buf_free_direct_pages( + struct xfs_buf *bp) +{ + struct xfs_buf_map *map; + unsigned int m, p, n; + int error = 0, err2; + + ASSERT(bp->b_target->bt_flags & XFS_BUFTARG_DIRECT_MAP); + + if 
(xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr, bp->b_page_count); + + for (m = 0, p = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) { + for (n = 0; n < map->bm_len; n += BTOBB(PAGE_SIZE)) { + struct page *page = bp->b_pages[p]; + unsigned int len; + + len = min_t(unsigned int, BBTOB(map->bm_len - n), + PAGE_SIZE); + + lock_page(page); + err2 = xfile_obj_put_page(bp->b_target->bt_xfile, + BBTOB(map->bm_bn + n), len, page); + if (!error && err2) + error = err2; + bp->b_pages[p++] = NULL; + } + } + + if (error) + xfs_err(bp->b_mount, "%s failed errno %d", __func__, error); + + if (bp->b_pages != bp->b_page_array) + kmem_free(bp->b_pages); + bp->b_pages = NULL; + bp->b_flags &= ~_XBF_DIRECT_MAP; +} + +static void xfs_buf_free_pages( struct xfs_buf *bp) { @@ -302,7 +341,9 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) + if (bp->b_flags & _XBF_DIRECT_MAP) + xfs_buf_free_direct_pages(bp); + else if (bp->b_flags & _XBF_PAGES) xfs_buf_free_pages(bp); else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); @@ -401,6 +442,93 @@ xfs_buf_alloc_pages( } /* + * Try to map storage directly, if the target supports it. Returns 0 for + * success, -ENOTBLK to mean "not supported", or the usual negative errno. + */ +static int +xfs_buf_alloc_direct_pages( + struct xfs_buf *bp, + xfs_buf_flags_t flags) +{ + struct xfs_buf_map *map; + gfp_t gfp_mask = __GFP_NOWARN; + const unsigned int page_align_mask = PAGE_SIZE - 1; + unsigned int m, p, n; + int error; + + ASSERT(bp->b_target->bt_flags & XFS_BUFTARG_IN_MEMORY); + + /* For direct-map buffers, each map has to be page aligned. 
*/ + for (m = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) + if (BBTOB(map->bm_bn | map->bm_len) & page_align_mask) + return -ENOTBLK; + + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + else + gfp_mask |= GFP_NOFS; + + /* Make sure that we have a page list */ + bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); + if (bp->b_page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, + gfp_mask); + if (!bp->b_pages) + return -ENOMEM; + } + + /* Map in the xfile pages. */ + for (m = 0, p = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) { + for (n = 0; n < map->bm_len; n += BTOBB(PAGE_SIZE)) { + unsigned int len; + + len = min_t(unsigned int, BBTOB(map->bm_len - n), + PAGE_SIZE); + + error = xfile_obj_get_page(bp->b_target->bt_xfile, + BBTOB(map->bm_bn + n), len, + &bp->b_pages[p++]); + if (error) + goto fail; + } + } + + /* Unlock all the pages now that we've grabbed them all. */ + for (p = 0; p < bp->b_page_count; p++) { + ASSERT(PageUptodate(bp->b_pages[p])); + unlock_page(bp->b_pages[p]); + } + + bp->b_flags |= _XBF_DIRECT_MAP; + return 0; + +fail: + for (m = 0, p = 0, map = bp->b_maps; m < bp->b_map_count; m++, map++) { + for (n = 0; n < map->bm_len; n += BTOBB(PAGE_SIZE)) { + struct page *page = bp->b_pages[p++]; + unsigned int len; + + if (!page) + continue; + + len = min_t(unsigned int, BBTOB(map->bm_len - n), + PAGE_SIZE); + + xfile_obj_put_page(bp->b_target->bt_xfile, + BBTOB(map->bm_bn + n), len, page); + } + } + + if (bp->b_pages != bp->b_page_array) + kmem_free(bp->b_pages); + bp->b_pages = NULL; + bp->b_page_count = 0; + return error; +} + +/* * Map buffer into kernel address-space if necessary. 
*/ STATIC int @@ -408,7 +536,8 @@ _xfs_buf_map_pages( struct xfs_buf *bp, uint flags) { - ASSERT(bp->b_flags & _XBF_PAGES); + ASSERT(bp->b_flags & (_XBF_PAGES | _XBF_DIRECT_MAP)); + if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]); @@ -625,7 +754,7 @@ found: */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM | _XBF_PAGES | _XBF_DIRECT_MAP; bp->b_ops = NULL; } @@ -680,6 +809,13 @@ xfs_buf_get_map( if (error) return error; + /* Try to map pages directly, or fall back to memory. */ + if (target->bt_flags & XFS_BUFTARG_DIRECT_MAP) { + error = xfs_buf_alloc_direct_pages(new_bp, flags); + if (error && error != -ENOTBLK) + goto out_free_buf; + } + /* * For buffers that fit entirely within a single page, first attempt to * allocate the memory from the heap to minimise memory usage. If we @@ -1543,6 +1679,29 @@ xfs_buf_ioapply_in_memory( xfs_buf_ioend(bp); } +void +xfs_buf_ioapply_direct_pages( + struct xfs_buf *bp, + bool is_write) +{ + unsigned int i; + bool ioerr = false; + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + lock_page(page); + if (is_write) + set_page_dirty(page); + ioerr |= PageHWPoison(page); + unlock_page(page); + } + + if (ioerr) + cmpxchg(&bp->b_io_error, 0, -EIO); +} + + STATIC void _xfs_buf_ioapply( struct xfs_buf *bp) @@ -1600,6 +1759,11 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ op |= REQ_META; + if (bp->b_target->bt_flags & XFS_BUFTARG_DIRECT_MAP) { + xfs_buf_ioapply_direct_pages(bp, bp->b_flags & XBF_WRITE); + return; + } + if (bp->b_target->bt_flags & XFS_BUFTARG_IN_MEMORY) { xfs_buf_ioapply_in_memory(bp); return; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 5328db3af1e3..30eafc6d3775 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -41,6 +41,7 @@ struct xfile; #define _XBF_PAGES (1 << 20)/* backed by 
refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_DIRECT_MAP (1 << 23)/* pages directly mapped to storage */ /* flags used only as arguments to access routines */ #define _XBF_IGNORE_STALE (1 << 29)/* ignore stale buffers */ @@ -64,6 +65,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_DIRECT_MAP, "DIRECT_MAP" }, \ /* The following interface flags should never be set */ \ { _XBF_IGNORE_STALE, "IGNORE_STALE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ @@ -119,6 +121,8 @@ typedef struct xfs_buftarg { #define XFS_BUFTARG_SELF_CACHED (1U << 0) /* in-memory buftarg */ #define XFS_BUFTARG_IN_MEMORY (1U << 1) +/* buffer pages are direct-mapped (implies IN_MEMORY) */ +#define XFS_BUFTARG_DIRECT_MAP (1U << 2) static inline bool xfs_buftarg_in_memory( @@ -424,5 +428,6 @@ xfs_buftarg_zeroout( int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); +void xfs_buf_ioapply_direct_pages(struct xfs_buf *bp, bool is_write); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6549e50d852c..9cbd19531eb0 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -461,6 +461,13 @@ xfs_trans_dirty_buf( ASSERT(atomic_read(&bip->bli_refcount) > 0); /* + * For buffers that are directly mapped to an in-memory file, mark the + * pages dirty so that they'll be persisted properly. + */ + if (bp->b_target->bt_flags & XFS_BUFTARG_DIRECT_MAP) + xfs_buf_ioapply_direct_pages(bp, true); + + /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer * again. There are no races with the code in xfs_buf_item_unpin(), |