author     Darrick J. Wong <darrick.wong@oracle.com>  2020-10-25 17:14:33 -0700
committer  Darrick J. Wong <darrick.wong@oracle.com>  2020-10-26 18:32:13 -0700
commit     2c64ea508531d52a3a47f3fb7ef842d779449ab2 (patch)
tree       c1a522590dfecac0450c4812df52237b8e1e6321
parent     e8c543a453e230452e1b7878a8e8f79fce749f13 (diff)
xfs: create a big array data structure
Create a simple 'big array' data structure for storage of fixed-size metadata records that will be used to reconstruct a btree index. For repair operations, the most important operations are append, iterate, and sort.

Earlier implementations of the big array used linked lists and suffered from severe problems -- pinning all records in kernel memory was not a good idea and frequently led to OOM situations; random access was very inefficient; and record overhead for the lists was unacceptably high at 40-60%.

Therefore, the big memory array relies on the 'xfile' abstraction, which creates a memfd file and stores the records in page cache pages. Since the memfd is created in tmpfs, the memory pages can be pushed out to disk if necessary and we have a built-in usage limit of 50% of physical memory.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
-rw-r--r--  fs/xfs/Kconfig       |   1
-rw-r--r--  fs/xfs/Makefile      |   2
-rw-r--r--  fs/xfs/scrub/array.c | 665
-rw-r--r--  fs/xfs/scrub/array.h |  49
-rw-r--r--  fs/xfs/scrub/trace.c |   1
-rw-r--r--  fs/xfs/scrub/trace.h |  86
-rw-r--r--  fs/xfs/scrub/xfile.c | 250
-rw-r--r--  fs/xfs/scrub/xfile.h |  23
8 files changed, 1077 insertions(+), 0 deletions(-)
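
For orientation, here is a rough sketch of how a repair caller might drive the new big-array interfaces added by this patch. Only the xfbma_* calls come from the patch; the record type, values, and surrounding function are hypothetical.

	/* Hypothetical fixed-size record staged during a btree rebuild. */
	struct xrep_rec {
		uint64_t	key;
		uint64_t	value;
	};

	static int
	xrep_stage_and_replay(void)
	{
		struct xfbma	*array;
		struct xrep_rec	rec = { .key = 1, .value = 2 };
		uint64_t	i = 0;
		int		error;

		/* Records cannot be larger than a page. */
		array = xfbma_init("rebuild records", sizeof(struct xrep_rec));
		if (IS_ERR(array))
			return PTR_ERR(array);

		/* Stage records in memfd-backed memory as they are found. */
		error = xfbma_append(array, &rec);
		if (error)
			goto out;

		/* Walk the non-NULL records; -ENODATA marks the end. */
		while ((error = xfbma_iter_get(array, &i, &rec)) == 0) {
			/* ...feed rec to the btree bulk loader here... */
		}
		if (error == -ENODATA)
			error = 0;
	out:
		xfbma_destroy(array);
		return error;
	}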
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 9fac5ea8d0e4..7f12b40146b3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -97,6 +97,7 @@ config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
+ depends on TMPFS && SHMEM
help
If you say Y here you will be able to check metadata on a
mounted XFS filesystem. This feature is intended to reduce
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 04611a1068b4..6269de7582b7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -137,6 +137,7 @@ xfs-y += $(addprefix scrub/, \
trace.o \
agheader.o \
alloc.o \
+ array.o \
attr.o \
bmap.o \
btree.o \
@@ -152,6 +153,7 @@ xfs-y += $(addprefix scrub/, \
rmap.o \
scrub.o \
symlink.o \
+ xfile.o \
)
xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
diff --git a/fs/xfs/scrub/array.c b/fs/xfs/scrub/array.c
new file mode 100644
index 000000000000..4b5eb6a0c9bd
--- /dev/null
+++ b/fs/xfs/scrub/array.c
@@ -0,0 +1,665 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/array.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+
+/*
+ * XFS Fixed-Size Big Memory Array
+ * ===============================
+ *
+ * The file-backed memory array uses a memfd "file" to store large numbers of
+ * fixed-size records in memory that can be paged out. This puts less stress
+ * on the memory reclaim algorithms because memfd file pages are not pinned and
+ * can be paged out; however, array access is less direct than would be in a
+ * regular memory array. Access to the array is performed via indexed get and
+ * put methods, and an append method is provided for convenience. Array
+ * elements can be set to all zeroes, which means that the entry is NULL and
+ * will be skipped during iteration.
+ */
+
+/*
+ * Pointer to temp space. Because we can't access the memfd data directly, we
+ * allocate a small amount of memory on the end of the xfbma to buffer array
+ * items when we need space to store values temporarily.
+ */
+#define XFBMA_MAX_TEMP (2)
+static inline void *
+xfbma_temp(
+ struct xfbma *array,
+ unsigned int idx)
+{
+ ASSERT(idx < XFBMA_MAX_TEMP);
+
+ return ((char *)(array + 1)) + (idx * array->obj_size);
+}
+
+/* Compute array index given an xfile offset. */
+static uint64_t
+xfbma_index(
+ struct xfbma *array,
+ loff_t off)
+{
+ return div_u64(off, array->obj_size);
+}
+
+/* Compute xfile offset of array element. */
+static inline loff_t xfbma_offset(struct xfbma *array, uint64_t idx)
+{
+ return idx * array->obj_size;
+}
+
+/*
+ * Initialize a big memory array. Array records cannot be larger than a
+ * page, and the array cannot span more bytes than the page cache supports.
+ */
+struct xfbma *
+xfbma_init(
+ const char *description,
+ size_t obj_size)
+{
+ struct xfbma *array;
+ struct xfile *xfile;
+ int error;
+
+ ASSERT(obj_size < PAGE_SIZE);
+
+ xfile = xfile_create(description, 0);
+ if (IS_ERR(xfile))
+ return ERR_CAST(xfile);
+
+ error = -ENOMEM;
+ array = kmem_alloc(sizeof(struct xfbma) + (XFBMA_MAX_TEMP * obj_size),
+ KM_NOFS | KM_MAYFAIL);
+ if (!array)
+ goto out_xfile;
+
+ array->xfile = xfile;
+ array->obj_size = obj_size;
+ array->nr = 0;
+ array->max_nr = xfbma_index(array, MAX_LFS_FILESIZE);
+ return array;
+out_xfile:
+ xfile_destroy(xfile);
+ return ERR_PTR(error);
+}
+
+/* Destroy the array. */
+void
+xfbma_destroy(
+ struct xfbma *array)
+{
+ xfile_destroy(array->xfile);
+ kmem_free(array);
+}
+
+/* Get an element from the array. */
+int
+xfbma_get(
+ struct xfbma *array,
+ uint64_t idx,
+ void *ptr)
+{
+ if (idx >= array->nr)
+ return -ENODATA;
+
+ return xfile_pread(array->xfile, ptr, array->obj_size,
+ xfbma_offset(array, idx));
+}
+
+/* Put an element in the array. */
+int
+xfbma_set(
+ struct xfbma *array,
+ uint64_t idx,
+ void *ptr)
+{
+ int ret;
+
+ if (idx >= array->max_nr)
+ return -EFBIG;
+
+ ret = xfile_pwrite(array->xfile, ptr, array->obj_size,
+ xfbma_offset(array, idx));
+ if (ret)
+ return ret;
+
+ array->nr = max(array->nr, idx + 1);
+ return 0;
+}
+
+/* Is this array element NULL? */
+bool
+xfbma_is_null(
+ struct xfbma *array,
+ void *ptr)
+{
+ return !memchr_inv(ptr, 0, array->obj_size);
+}
+
+/* Store an element in the first NULL slot in the array, or append it. */
+int
+xfbma_insert_anywhere(
+ struct xfbma *array,
+ void *ptr)
+{
+ void *temp = xfbma_temp(array, 0);
+ uint64_t i;
+ int error;
+
+ /* Find a null slot to put it in. */
+ for (i = 0; i < array->nr; i++) {
+ error = xfbma_get(array, i, temp);
+ if (error || !xfbma_is_null(array, temp))
+ continue;
+ return xfbma_set(array, i, ptr);
+ }
+
+ /* No null slots, just dump it on the end. */
+ return xfbma_append(array, ptr);
+}
+
+/* NULL an element in the array. */
+int
+xfbma_nullify(
+ struct xfbma *array,
+ uint64_t idx)
+{
+ void *temp = xfbma_temp(array, 0);
+
+ memset(temp, 0, array->obj_size);
+ return xfbma_set(array, idx, temp);
+}
+
+/* Return length of array. */
+uint64_t
+xfbma_length(
+ struct xfbma *array)
+{
+ return array->nr;
+}
+
+/*
+ * Select the median value from a[lo], a[mid], and a[hi]. Put the median in
+ * a[lo], the lowest in a[mid], and the highest in a[hi]. Using the median of
+ * the three reduces the chances that we pick the worst case pivot value, since
+ * it's likely that our array values are nearly sorted.
+ */
+STATIC int
+xfbma_qsort_pivot(
+ struct xfbma *array,
+ xfbma_cmp_fn cmp_fn,
+ uint64_t lo,
+ uint64_t mid,
+ uint64_t hi)
+{
+ void *a = xfbma_temp(array, 0);
+ void *b = xfbma_temp(array, 1);
+ int error;
+
+ /* if a[mid] < a[lo], swap a[mid] and a[lo]. */
+ error = xfbma_get(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_get(array, lo, b);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ error = xfbma_set(array, lo, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, mid, b);
+ if (error)
+ return error;
+ }
+
+ /* if a[hi] < a[mid], swap a[mid] and a[hi]. */
+ error = xfbma_get(array, hi, a);
+ if (error)
+ return error;
+ error = xfbma_get(array, mid, b);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ error = xfbma_set(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, hi, b);
+ if (error)
+ return error;
+ } else {
+ goto move_front;
+ }
+
+ /* if a[mid] < a[lo], swap a[mid] and a[lo]. */
+ error = xfbma_get(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_get(array, lo, b);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ error = xfbma_set(array, lo, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, mid, b);
+ if (error)
+ return error;
+ }
+move_front:
+ /* move our selected pivot to a[lo] */
+ error = xfbma_get(array, lo, b);
+ if (error)
+ return error;
+ error = xfbma_get(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, mid, b);
+ if (error)
+ return error;
+ return xfbma_set(array, lo, a);
+}
+
+/*
+ * Perform an insertion sort on a subset of the array.
+ * Though insertion sort is an O(n^2) algorithm, for small set sizes it's
+ * faster than quicksort's stack machine, so we let it take over for small subsets.
+ */
+STATIC int
+xfbma_isort(
+ struct xfbma *array,
+ xfbma_cmp_fn cmp_fn,
+ uint64_t start,
+ uint64_t end)
+{
+ void *a = xfbma_temp(array, 0);
+ void *b = xfbma_temp(array, 1);
+ uint64_t tmp;
+ uint64_t i;
+ uint64_t run;
+ int error;
+
+ /*
+ * Move the smallest element in a[start..end] to a[start]. This
+ * simplifies the loop control logic below.
+ */
+ tmp = start;
+ error = xfbma_get(array, tmp, b);
+ if (error)
+ return error;
+ for (run = start + 1; run <= end; run++) {
+ /* if a[run] < a[tmp], tmp = run */
+ error = xfbma_get(array, run, a);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ tmp = run;
+ memcpy(b, a, array->obj_size);
+ }
+ }
+
+ /*
+ * The smallest element is a[tmp]; swap with a[start] if tmp != start.
+ * Recall that a[tmp] is already in *b.
+ */
+ if (tmp != start) {
+ error = xfbma_get(array, start, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, tmp, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, start, b);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Perform an insertion sort on a[start+1..end]. We already made sure
+ * that the smallest value in the original range is now in a[start],
+ * so the inner loop should never underflow.
+ *
+ * For each a[start+2..end], make sure it's in the correct position
+ * with respect to the elements that came before it.
+ */
+ for (run = start + 2; run <= end; run++) {
+ error = xfbma_get(array, run, a);
+ if (error)
+ return error;
+
+ /*
+ * Find the correct place for a[run] by walking leftwards
+ * towards the start of the range until a[tmp] is no longer
+ * greater than a[run].
+ */
+ tmp = run - 1;
+ error = xfbma_get(array, tmp, b);
+ if (error)
+ return error;
+ while (cmp_fn(a, b) < 0) {
+ tmp--;
+ error = xfbma_get(array, tmp, b);
+ if (error)
+ return error;
+ }
+ tmp++;
+
+ /*
+ * If tmp != run, then a[tmp..run-1] are all less than a[run],
+ * so right barrel roll a[tmp..run] to get this range in
+ * sorted order.
+ */
+ if (tmp == run)
+ continue;
+
+ for (i = run; i >= tmp; i--) {
+ error = xfbma_get(array, i - 1, b);
+ if (error)
+ return error;
+ error = xfbma_set(array, i, b);
+ if (error)
+ return error;
+ }
+ error = xfbma_set(array, tmp, a);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Sort the array elements via quicksort. This implementation incorporates
+ * four optimizations discussed in Sedgewick:
+ *
+ * 1. Use an explicit stack of array indices to store the next array
+ * partition to sort. This helps us to avoid recursion in the call stack,
+ * which is particularly expensive in the kernel.
+ *
+ * 2. Choose the pivot element using a median-of-three decision tree. This
+ * reduces the probability of selecting a bad pivot value which causes
+ *    worst case behavior (i.e. partition sizes of 1). Chances are fairly good
+ * that the list is nearly sorted, so this is important.
+ *
+ * 3. The smaller of the two sub-partitions is pushed onto the stack to start
+ * the next level of recursion, and the larger sub-partition replaces the
+ * current stack frame. This guarantees that we won't need more than
+ * log2(nr) stack space.
+ *
+ * 4. Use insertion sort for small sets since insertion sort is faster
+ * for small, mostly sorted array segments. In the author's experience,
+ * substituting insertion sort for arrays smaller than 4 elements yields
+ * a ~10% reduction in runtime.
+ */
+
+/*
+ * Due to the use of signed indices, we can only support up to 2^63 records.
+ * Files can only grow to 2^63 bytes, so this is not much of a limitation.
+ */
+#define QSORT_MAX_RECS (1ULL << 63)
+
+/*
+ * For array subsets smaller than 4 elements, it's slightly faster to use
+ * insertion sort than quicksort's stack machine.
+ */
+#define ISORT_THRESHOLD (4)
+int
+xfbma_sort(
+ struct xfbma *array,
+ xfbma_cmp_fn cmp_fn)
+{
+ int64_t *stack;
+ int64_t *beg;
+ int64_t *end;
+ void *pivot = xfbma_temp(array, 0);
+ void *temp = xfbma_temp(array, 1);
+ int64_t lo, mid, hi;
+ const int max_stack_depth = ilog2(array->nr) + 1;
+ int stack_depth = 0;
+ int max_stack_used = 0;
+ int error = 0;
+
+ if (array->nr == 0)
+ return 0;
+ if (array->nr >= QSORT_MAX_RECS)
+ return -E2BIG;
+ if (array->nr <= ISORT_THRESHOLD)
+ return xfbma_isort(array, cmp_fn, 0, array->nr - 1);
+
+ /* Allocate our pointer stacks for sorting. */
+ stack = kmem_alloc(sizeof(int64_t) * 2 * max_stack_depth,
+ KM_NOFS | KM_MAYFAIL);
+ if (!stack)
+ return -ENOMEM;
+ beg = stack;
+ end = &stack[max_stack_depth];
+
+ beg[0] = 0;
+ end[0] = array->nr;
+ while (stack_depth >= 0) {
+ lo = beg[stack_depth];
+ hi = end[stack_depth] - 1;
+
+ /* Nothing left in this partition to sort; pop stack. */
+ if (lo >= hi) {
+ stack_depth--;
+ continue;
+ }
+
+ /* Small enough for insertion sort? */
+ if (hi - lo <= ISORT_THRESHOLD) {
+ error = xfbma_isort(array, cmp_fn, lo, hi);
+ if (error)
+ goto out_free;
+ stack_depth--;
+ continue;
+ }
+
+ /* Pick a pivot, move it to a[lo] and stash it. */
+ mid = lo + ((hi - lo) / 2);
+ error = xfbma_qsort_pivot(array, cmp_fn, lo, mid, hi);
+ if (error)
+ goto out_free;
+
+ error = xfbma_get(array, lo, pivot);
+ if (error)
+ goto out_free;
+
+ /*
+ * Rearrange a[lo..hi] such that everything smaller than the
+ * pivot is on the left side of the range and everything larger
+ * than the pivot is on the right side of the range.
+ */
+ while (lo < hi) {
+ /*
+ * Decrement hi until it finds an a[hi] less than the
+ * pivot value.
+ */
+ error = xfbma_get(array, hi, temp);
+ if (error)
+ goto out_free;
+ while (cmp_fn(temp, pivot) >= 0 && lo < hi) {
+ hi--;
+ error = xfbma_get(array, hi, temp);
+ if (error)
+ goto out_free;
+ }
+
+ /* Copy that item (a[hi]) to a[lo]. */
+ if (lo < hi) {
+ error = xfbma_set(array, lo++, temp);
+ if (error)
+ goto out_free;
+ }
+
+ /*
+ * Increment lo until it finds an a[lo] greater than
+ * the pivot value.
+ */
+ error = xfbma_get(array, lo, temp);
+ if (error)
+ goto out_free;
+ while (cmp_fn(temp, pivot) <= 0 && lo < hi) {
+ lo++;
+ error = xfbma_get(array, lo, temp);
+ if (error)
+ goto out_free;
+ }
+
+ /* Copy that item (a[lo]) to a[hi]. */
+ if (lo < hi) {
+ error = xfbma_set(array, hi--, temp);
+ if (error)
+ goto out_free;
+ }
+ }
+
+ /*
+ * Put our pivot value in the correct place at a[lo]. All
+ * values between a[beg[i]] and a[lo - 1] should be less than
+ * the pivot; and all values between a[lo + 1] and a[end[i]-1]
+ * should be greater than the pivot.
+ */
+ error = xfbma_set(array, lo, pivot);
+ if (error)
+ goto out_free;
+
+ /*
+ * Set up the pointers for the next iteration. We push onto
+ * the stack all of the unsorted values between a[lo + 1] and
+ * a[end[i]], and we tweak the current stack frame to point to
+ * the unsorted values between a[beg[i]] and a[lo] so that
+ * those values will be sorted when we pop the stack.
+ */
+ beg[stack_depth + 1] = lo + 1;
+ end[stack_depth + 1] = end[stack_depth];
+ end[stack_depth++] = lo;
+
+ /* Check our stack usage. */
+ max_stack_used = max(max_stack_used, stack_depth);
+ if (stack_depth >= max_stack_depth) {
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out_free;
+ }
+
+ /*
+ * Always start with the smaller of the two partitions to keep
+ * the amount of recursion in check.
+ */
+ if (end[stack_depth] - beg[stack_depth] >
+ end[stack_depth - 1] - beg[stack_depth - 1]) {
+ swap(beg[stack_depth], beg[stack_depth - 1]);
+ swap(end[stack_depth], end[stack_depth - 1]);
+ }
+ }
+
+out_free:
+ kfree(stack);
+ trace_xfbma_sort_stats(array->nr, max_stack_depth, max_stack_used,
+ error);
+ return error;
+}
+
+/*
+ * Decide which array item we're going to read as part of an _iter_get.
+ * @cur is the array index, and @pos is the file offset of that array index in
+ * the backing xfile. Returns ENODATA if we reach the end of the records.
+ *
+ * Reading from a hole in a sparse xfile causes page instantiation, so for
+ * iterating a (possibly sparse) array we need to figure out if the cursor is
+ * pointing at a totally uninitialized hole and move the cursor up if
+ * necessary.
+ */
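+/*
+ * Worked example (illustrative numbers only, assuming 24-byte records and
+ * 4096-byte pages): a cursor at record 512 sits at file offset 12288, the
+ * start of a page, so the in-the-middle-of-a-page shortcut described above
+ * does not apply and we call SEEK_DATA on the record's last byte, offset
+ * 12311.  If the intervening pages are holes and data resumes at offset
+ * 24576, rounding 24576 up to a record boundary leaves it unchanged, so the
+ * cursor jumps to record 24576 / 24 = 1024.
+ */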
+static inline int
+xfbma_find_data(
+ struct xfbma *array,
+ uint64_t *cur,
+ loff_t *pos)
+{
+ unsigned int pgoff = offset_in_page(*pos);
+ loff_t end_pos = *pos + array->obj_size - 1;
+ loff_t new_pos;
+
+ /*
+ * If the current array record is not adjacent to a page boundary, we
+ * are in the middle of the page. We do not need to move the cursor.
+ */
+ if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
+ return 0;
+
+ /*
+ * Call SEEK_DATA on the last byte in the record we're about to read.
+ * If the record ends at (or crosses) the end of a page then we know
+ * that the first byte of the record is backed by pages and don't need
+ * to query it. If instead the record begins at the start of the page
+ * then we know that querying the last byte is just as good as querying
+ * the first byte, since records cannot be larger than a page.
+ *
+ * If the call returns the same file offset, we know this record is
+ * backed by real pages. We do not need to move the cursor.
+ */
+ new_pos = xfile_seek_data(array->xfile, end_pos);
+ if (new_pos == -ENXIO)
+ return -ENODATA;
+ if (new_pos < 0)
+ return new_pos;
+ if (new_pos == end_pos)
+ return 0;
+
+ /*
+ * Otherwise, SEEK_DATA told us how far up to move the file pointer to
+ * find more data. Move the array index to the first record past the
+ * byte offset we were given.
+ */
+ new_pos = roundup_64(new_pos, array->obj_size);
+ *cur = xfbma_index(array, new_pos);
+ *pos = xfbma_offset(array, *cur);
+ return 0;
+}
+
+/*
+ * Starting at *idx, fetch the next non-null array entry and advance the index
+ * to set up the next _iter_get call. Returns ENODATA if we reach the end of
+ * the array.
+ */
+int
+xfbma_iter_get(
+ struct xfbma *array,
+ uint64_t *idx,
+ void *rec)
+{
+ uint64_t cur = *idx;
+ loff_t off = xfbma_offset(array, cur);
+ int error;
+
+ do {
+ if (cur >= array->nr)
+ return -ENODATA;
+
+ /*
+ * Ask the backing store for the location of next possible
+ * written record, then retrieve that record.
+ */
+ error = xfbma_find_data(array, &cur, &off);
+ if (error)
+ return error;
+ error = xfbma_get(array, cur, rec);
+ if (error)
+ return error;
+
+ cur++;
+ off += array->obj_size;
+ } while (xfbma_is_null(array, rec));
+
+ *idx = cur;
+ return 0;
+}
diff --git a/fs/xfs/scrub/array.h b/fs/xfs/scrub/array.h
new file mode 100644
index 000000000000..0a7599825579
--- /dev/null
+++ b/fs/xfs/scrub/array.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_ARRAY_H__
+#define __XFS_SCRUB_ARRAY_H__
+
+struct xfbma {
+ /* Underlying file that backs the array. */
+ struct xfile *xfile;
+
+ /* Number of array elements. */
+ uint64_t nr;
+
+ /* Maximum possible array size. */
+ uint64_t max_nr;
+
+ /* Size of an array element. */
+ size_t obj_size;
+};
+
+struct xfbma *xfbma_init(const char *descr, size_t obj_size);
+void xfbma_destroy(struct xfbma *array);
+int xfbma_get(struct xfbma *array, uint64_t idx, void *ptr);
+int xfbma_set(struct xfbma *array, uint64_t idx, void *ptr);
+int xfbma_insert_anywhere(struct xfbma *array, void *ptr);
+bool xfbma_is_null(struct xfbma *array, void *ptr);
+int xfbma_nullify(struct xfbma *array, uint64_t idx);
+
+/* Append an element to the array. */
+static inline int xfbma_append(struct xfbma *array, void *ptr)
+{
+ return xfbma_set(array, array->nr, ptr);
+}
+
+uint64_t xfbma_length(struct xfbma *array);
+int xfbma_iter_get(struct xfbma *array, uint64_t *idx, void *rec);
+
+typedef int (*xfbma_cmp_fn)(const void *a, const void *b);
+
+int xfbma_sort(struct xfbma *array, xfbma_cmp_fn cmp_fn);
+
+#define foreach_xfbma_item(array, i, rec) \
+ for ((i) = 0; (i) < xfbma_length((array)); (i)++) \
+ if (xfbma_get((array), (i), &(rec)) == 0 && \
+ !xfbma_is_null((array), &(rec)))
+
+#endif /* __XFS_SCRUB_ARRAY_H__ */
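
To show how the pieces of this header fit together, here is a hedged sketch of a sort-and-iterate pass; it reuses the hypothetical struct xrep_rec from the earlier sketch, and only xfbma_sort, xfbma_cmp_fn, and foreach_xfbma_item come from this patch.

	/* Order staged records by key; the signature matches xfbma_cmp_fn. */
	static int
	xrep_rec_cmp(const void *a, const void *b)
	{
		const struct xrep_rec	*ra = a;
		const struct xrep_rec	*rb = b;

		if (ra->key > rb->key)
			return 1;
		if (ra->key < rb->key)
			return -1;
		return 0;
	}

	static int
	xrep_sort_and_walk(struct xfbma *array)
	{
		struct xrep_rec	rec;
		uint64_t	i;
		int		error;

		error = xfbma_sort(array, xrep_rec_cmp);
		if (error)
			return error;

		/* The macro skips NULLed entries and failed reads for us. */
		foreach_xfbma_item(array, i, rec) {
			/* ...consume rec in sorted order... */
		}
		return 0;
	}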
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 2c6c248be823..b3bc39411889 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -13,6 +13,7 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "scrub/scrub.h"
+#include "scrub/xfile.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 3a816c4d0f9e..a1ad712f09e9 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -12,6 +12,8 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+struct xfile;
+
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
* TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
@@ -651,6 +653,90 @@ TRACE_EVENT(xchk_fscounters_within_range,
__entry->old_value)
)
+TRACE_EVENT(xfile_create,
+ TP_PROTO(struct xfile *xf),
+ TP_ARGS(xf),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __array(char, pathname, 256)
+ ),
+ TP_fast_assign(
+ char pathname[257];
+ char *path;
+
+ __entry->ino = file_inode(xf->filp)->i_ino;
+ memset(pathname, 0, sizeof(pathname));
+ path = file_path(xf->filp, pathname, sizeof(pathname) - 1);
+ if (IS_ERR(path))
+ path = "(unknown)";
+ strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ ),
+ TP_printk("ino %lu path %s",
+ __entry->ino,
+ __entry->pathname)
+);
+
+DECLARE_EVENT_CLASS(xfile_class,
+ TP_PROTO(struct xfile *xf, loff_t offset, long long count),
+ TP_ARGS(xf, offset, count),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(long long, bytes)
+ __field(loff_t, offset)
+ __field(long long, count)
+ ),
+ TP_fast_assign(
+ struct kstat statbuf;
+ int ret;
+
+ ret = xfile_statx(xf, &statbuf);
+ if (!ret)
+ __entry->bytes = statbuf.blocks * 512;
+ else
+ __entry->bytes = -1;
+ __entry->ino = file_inode(xf->filp)->i_ino;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("ino %lu mem_usage %lld offset %lld count %lld",
+ __entry->ino,
+ __entry->bytes,
+ __entry->offset,
+ __entry->count)
+);
+#define DEFINE_XFILE_EVENT(name) \
+DEFINE_EVENT(xfile_class, name, \
+ TP_PROTO(struct xfile *xf, loff_t offset, long long count), \
+ TP_ARGS(xf, offset, count))
+DEFINE_XFILE_EVENT(xfile_destroy);
+DEFINE_XFILE_EVENT(xfile_discard);
+DEFINE_XFILE_EVENT(xfile_pread);
+DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_seek_data);
+
+TRACE_EVENT(xfbma_sort_stats,
+ TP_PROTO(uint64_t nr, unsigned int max_stack_depth,
+ unsigned int max_stack_used, int error),
+ TP_ARGS(nr, max_stack_depth, max_stack_used, error),
+ TP_STRUCT__entry(
+ __field(uint64_t, nr)
+ __field(unsigned int, max_stack_depth)
+ __field(unsigned int, max_stack_used)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->max_stack_depth = max_stack_depth;
+ __entry->max_stack_used = max_stack_used;
+ __entry->error = error;
+ ),
+ TP_printk("nr %llu max_depth %u max_used %u error %d",
+ __entry->nr,
+ __entry->max_stack_depth,
+ __entry->max_stack_used,
+ __entry->error)
+);
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
new file mode 100644
index 000000000000..a2d54ce83091
--- /dev/null
+++ b/fs/xfs/scrub/xfile.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/array.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+#include <linux/shmem_fs.h>
+
+/*
+ * Swappable Temporary Memory
+ * ==========================
+ *
+ * Online checking sometimes needs to be able to stage a large amount of data
+ * in memory. This information might not fit in the available memory and it
+ * doesn't all need to be accessible at all times. In other words, we want an
+ * indexed data buffer to store data that can be paged out.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
+ * store our staging data. This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xfile must be freed with xfile_destroy.
+ *
+ * xfiles assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken. Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
+ * from a hole cause a page to be mapped into the file. If you are going to
+ * create a sparse xfile, please be careful about reading from uninitialized
+ * parts of the file.
+ */
+
+/*
+ * xfiles must not be exposed to userspace and require upper layers to
+ * coordinate access to the one handle returned by the constructor, so
+ * establish a separate lock class for xfiles to avoid confusing lockdep.
+ */
+static struct lock_class_key xfile_i_mutex_key;
+
+/*
+ * Create an xfile of the given size. The description will be used in the
+ * trace output.
+ */
+struct xfile *
+xfile_create(
+ const char *description,
+ loff_t size)
+{
+ struct xfile *xf;
+
+ xf = kmem_alloc(sizeof(struct xfile), KM_MAYFAIL);
+ if (!xf)
+ return ERR_PTR(-ENOMEM);
+
+ xf->filp = shmem_file_setup(description, size, 0);
+ if (!xf->filp) {
+ kmem_free(xf);
+ return ERR_PTR(-ENOMEM);
+ }
+ if (IS_ERR(xf->filp)) {
+ int ret = PTR_ERR(xf->filp);
+
+ kmem_free(xf);
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * We want a large sparse file that we can pread, pwrite, and seek.
+ * xfile users are responsible for keeping the xfile hidden away from
+ * all other callers, so we skip timestamp updates and security checks.
+ */
+ xf->filp->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
+ FMODE_LSEEK;
+ xf->filp->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
+ xf->filp->f_inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
+
+ lockdep_set_class(&file_inode(xf->filp)->i_rwsem, &xfile_i_mutex_key);
+
+ trace_xfile_create(xf);
+ return xf;
+}
+
+/* Close the file and release all resources. */
+void
+xfile_destroy(
+ struct xfile *xf)
+{
+ struct inode *inode = file_inode(xf->filp);
+
+ trace_xfile_destroy(xf, 0, 0);
+
+ lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
+ fput(xf->filp);
+}
+
+/* Read a buffer directly from the xfile's page cache. */
+int
+xfile_pread(
+ struct xfile *xf,
+ void *buf,
+ size_t count,
+ loff_t offset)
+{
+ struct inode *inode = file_inode(xf->filp);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = NULL;
+ unsigned int pflags;
+
+ if (count > MAX_RW_COUNT)
+ return -ENOMEM;
+ if (inode->i_sb->s_maxbytes - offset < count)
+ return -ENOMEM;
+
+ trace_xfile_pread(xf, offset, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *xfaddr;
+ unsigned int len;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(offset));
+
+ page = shmem_read_mapping_page_gfp(mapping,
+ offset >> PAGE_SHIFT, __GFP_NOWARN);
+ if (IS_ERR(page))
+ break;
+
+ if (page) {
+ if (!PageUptodate(page)) {
+ put_page(page);
+ memset(buf, 0, len);
+ } else {
+ xfaddr = page_address(page) +
+ offset_in_page(offset);
+ memcpy(buf, xfaddr, len);
+ put_page(page);
+ }
+ } else {
+ memset(buf, 0, len);
+ }
+
+ count -= len;
+ offset += len;
+ buf += len;
+ }
+ memalloc_nofs_restore(pflags);
+
+ /*
+ * Since we're treating this file as "memory", any IO error or short
+ * read should be treated as a failure to allocate memory.
+ */
+ return count > 0 ? -ENOMEM : 0;
+}
+
+/* Write a buffer directly to the xfile's page cache. */
+int
+xfile_pwrite(
+ struct xfile *xf,
+ void *buf,
+ size_t count,
+ loff_t offset)
+{
+ struct inode *inode = file_inode(xf->filp);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = NULL;
+ unsigned int pflags;
+ int error;
+
+ if (count > MAX_RW_COUNT)
+ return -ENOMEM;
+ if (inode->i_sb->s_maxbytes - offset < count)
+ return -ENOMEM;
+
+ trace_xfile_pwrite(xf, offset, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *fsdata;
+ void *xfaddr;
+ unsigned int len;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(offset));
+
+ error = pagecache_write_begin(NULL, mapping, offset, len,
+ AOP_FLAG_NOFS, &page, &fsdata);
+ if (error)
+ break;
+
+ xfaddr = page_address(page) + offset_in_page(offset);
+ memcpy(xfaddr, buf, len);
+
+ error = pagecache_write_end(NULL, mapping, offset, len, len,
+ page, fsdata);
+ if (error < 0)
+ break;
+
+ count -= len;
+ offset += len;
+ buf += len;
+ }
+ memalloc_nofs_restore(pflags);
+
+ /*
+ * Since we're treating this file as "memory", any IO error or short
+ * write should be treated as a failure to allocate memory.
+ */
+ return count > 0 ? -ENOMEM : 0;
+}
+
+/* Discard pages backing a range of the xfile. */
+void
+xfile_discard(
+ struct xfile *xf,
+ loff_t start,
+ loff_t end)
+{
+ trace_xfile_discard(xf, start, end);
+ shmem_truncate_range(file_inode(xf->filp), start, end);
+}
+
+/* Find the next batch of xfile data for a given offset. */
+loff_t
+xfile_seek_data(
+ struct xfile *xf,
+ loff_t pos)
+{
+ loff_t ret;
+
+ ret = vfs_llseek(xf->filp, pos, SEEK_DATA);
+ trace_xfile_seek_data(xf, pos, ret);
+ return ret;
+}
+
+/* Query statx information for an xfile. */
+int
+xfile_statx(
+ struct xfile *xf,
+ struct kstat *statbuf)
+{
+ return vfs_getattr_nosec(&xf->filp->f_path, statbuf,
+ STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
+}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
new file mode 100644
index 000000000000..90e40229ca81
--- /dev/null
+++ b/fs/xfs/scrub/xfile.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_XFILE_H__
+#define __XFS_SCRUB_XFILE_H__
+
+struct xfile {
+ struct file *filp;
+};
+
+struct xfile *xfile_create(const char *description, loff_t size);
+void xfile_destroy(struct xfile *xf);
+
+int xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t offset);
+int xfile_pwrite(struct xfile *xf, void *buf, size_t count, loff_t offset);
+
+void xfile_discard(struct xfile *xf, loff_t start, loff_t end);
+loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
+int xfile_statx(struct xfile *xf, struct kstat *statbuf);
+
+#endif /* __XFS_SCRUB_XFILE_H__ */
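
Finally, a rough sketch of direct xfile usage for callers that do not need the array layer; only the xfile_* calls come from this patch, and the offsets and names are arbitrary example values.

	static int
	xexample_stage_bytes(void)
	{
		struct xfile	*xf;
		uint64_t	value = 42;
		uint64_t	readback;
		int		error;

		xf = xfile_create("example staging data", 0);
		if (IS_ERR(xf))
			return PTR_ERR(xf);

		/* Backing pages are created lazily, so sparse writes are cheap. */
		error = xfile_pwrite(xf, &value, sizeof(value), 8192);
		if (error)
			goto out;

		/* Reads are satisfied straight from the page cache. */
		error = xfile_pread(xf, &readback, sizeof(readback), 8192);
	out:
		xfile_destroy(xf);
		return error;
	}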