summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2021-09-17 16:27:43 -0700
committerDarrick J. Wong <djwong@kernel.org>2021-09-23 18:20:32 -0700
commit0e17ca79c379990cb88d363529a36fa8f8c0df6d (patch)
tree9d3cd2b4dba0721590e3a87bc8d0dd4409dd4846 /fs
parent4f90e6f3be1420ff85e3bed71a5d3cede9a23ccd (diff)
dax: prepare pmem for use by zero-initializing contents and clearing poisons
Our current "advice" to people using persistent memory and FSDAX who wish to recover upon receipt of a media error (aka 'hwpoison') event from ACPI is to punch-hole that part of the file and then pwrite it, which will magically cause the pmem to be reinitialized and the poison to be cleared. Punching doesn't make any sense at all -- the (re)allocation on pwrite does not permit the caller to specify where to find blocks, which means that we might not get the same pmem back. This pushes the user farther away from the goal of reinitializing poisoned memory and leads to user complaints about unnecessary file fragmentation. AFAICT, the only reason why the "punch and write" dance works at all is that XFS and ext4 currently call blkdev_issue_zeroout when allocating pmem ahead of a write call. Even a regular overwrite won't clear the poison, because dax_direct_access is smart enough to bail out on poisoned pmem, but not smart enough to clear it. To be fair, that function maps pages and has no idea what kinds of reads and writes the caller might want to perform. Therefore, create a dax_zeroinit_range function that filesystems can use to reset the pmem contents to zero and clear hardware media error flags. This uses the dax page zeroing helper function, which should ensure that subsequent accesses will not trip over any pre-existing media errors. Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/dax.c99
1 files changed, 99 insertions, 0 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 4e3e5a283a91..5d3dc74c7cf8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1714,3 +1714,102 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
+
+/*
+ * Zero-initialize the storage behind the mapping that @iter currently
+ * describes.
+ *
+ * Written (IOMAP_MAPPED) extents are zeroed with dax_zero_page_range().
+ * Unwritten extents and holes are reported as fully processed without
+ * touching the media.  Every mapping that cannot be handled here yields
+ * -ECANCELED.
+ *
+ * Returns the number of bytes processed on success, -ECANCELED if the mapping
+ * is unsuitable, or whatever error dax_zero_page_range() returned.
+ *
+ * NOTE(review): the page offset handed to dax_zero_page_range() is derived
+ * directly from iomap->addr.  Confirm that this is the offset space the dax
+ * helper expects (e.g. that no partition start adjustment is required) and
+ * that no dax read lock has to be held around the call.
+ */
+static loff_t
+dax_zeroinit_iter(struct iomap_iter *iter)
+{
+	struct iomap *map = &iter->iomap;
+	const u64 len = iomap_length(iter);
+	const u64 phys = map->addr + iter->pos - map->offset;
+	u64 first_pg;
+	u64 pg_count;
+	int error;
+
+	/* Without a dax device there is nothing to zero through. */
+	if (!map->dax_dev)
+		return -ECANCELED;
+
+	/* The dax zeroing function only operates on whole pages. */
+	if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(len))
+		return -ECANCELED;
+
+	/*
+	 * The dax function takes a pgoff_t, i.e. an unsigned long, so refuse
+	 * any page range that would not fit.
+	 */
+	first_pg = phys >> PAGE_SHIFT;
+	pg_count = len >> PAGE_SHIFT;
+	if (first_pg >= ULONG_MAX || first_pg + pg_count > ULONG_MAX)
+		return -ECANCELED;
+
+	/*
+	 * Shared extents, or a source mapping distinct from the destination,
+	 * mean the storage cannot be zeroed without fs intervention.
+	 */
+	if ((map->flags & IOMAP_F_SHARED) || iomap_iter_srcmap(iter) != map)
+		return -ECANCELED;
+
+	switch (map->type) {
+	case IOMAP_HOLE:
+		/*
+		 * Filesystems are required to zero freshly allocated extents
+		 * before mapping them into the file prior to writing.
+		 */
+	case IOMAP_UNWRITTEN:
+		/*
+		 * Filesystems are required to zero and convert unwritten
+		 * extent records to written records before writing, so these
+		 * can be skipped even though writing doesn't clear poison.
+		 */
+		return len;
+	case IOMAP_MAPPED:
+		break;
+	default:
+		/* Inline data and delalloc extents aren't supposed to happen. */
+		return -ECANCELED;
+	}
+
+	/* Only written extents reach this point. */
+	error = dax_zero_page_range(map->dax_dev, first_pg, pg_count);
+	if (error)
+		return error;
+	return len;
+}
+
+/*
+ * Initialize storage mapped to a DAX-mode file to a known value and ensure the
+ * media are ready to accept read and write commands.  This requires the use of
+ * the dax layer's zero page range function to write zeroes to a pmem region
+ * and to reset any hardware media error state.
+ *
+ * The physical extents must be aligned to page size.  The file must be backed
+ * by a pmem device.  The extents returned must not require copy on write (or
+ * any other mapping interventions from the filesystem) and must be contiguous.
+ *
+ * @inode: file whose storage is to be zero-initialized; must be in DAX mode.
+ * @pos:   starting file offset in bytes; must not be negative.
+ * @len:   number of bytes; @pos + @len must not extend past i_size.
+ * @ops:   iomap operations used to look up the file's mappings.
+ *
+ * Returns 0 if the zero initialization succeeded, -EINVAL if the inode is not
+ * in DAX mode or the byte range is invalid, -ECANCELED if the storage mappings
+ * do not support zero initialization, -EOPNOTSUPP if the device does not
+ * support it, or the usual negative errno.
+ */
+int
+dax_zeroinit_range(struct inode *inode, loff_t pos, u64 len,
+		   const struct iomap_ops *ops)
+{
+	struct iomap_iter iter = {
+		.inode		= inode,
+		.pos		= pos,
+		.len		= len,
+		.flags		= IOMAP_REPORT,
+	};
+	loff_t isize;
+	int ret;
+
+	if (!IS_DAX(inode))
+		return -EINVAL;
+
+	/*
+	 * Validate the range without ever computing pos + len: that sum mixes
+	 * a signed loff_t with a u64 and can wrap around, which would let a
+	 * negative @pos or an enormous @len slip past the EOF check.  Sample
+	 * i_size once so both comparisons see the same value.
+	 */
+	isize = i_size_read(inode);
+	if (pos < 0 || pos > isize || len > (u64)(isize - pos))
+		return -EINVAL;
+
+	while ((ret = iomap_iter(&iter, ops)) > 0)
+		iter.processed = dax_zeroinit_iter(&iter);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dax_zeroinit_range);