// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "scrub/xfarray.h"
#include "scrub/scrub.h"
#include "scrub/trace.h"
#include "scrub/xfile.h"
#include <linux/shmem_fs.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Online checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xfile mechanism uses an unlinked shmem file
 * to store our staging data.  This file is not installed in the file
 * descriptor table so that user programs cannot access the data, which means
 * that the xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
 * of a hole cause a page to be mapped into the file.  If you are going to
 * create a sparse xfile, please be careful about reading from uninitialized
 * parts of the file.  These pages are !Uptodate and will eventually be
 * reclaimed if not written, but in the short term this elevates memory
 * consumption.
 */

/*
 * xfiles must not be exposed to userspace and require upper layers to
 * coordinate access to the one handle returned by the constructor, so
 * establish a separate lock class for xfiles to avoid confusing lockdep.
 */
static struct lock_class_key xfile_i_mutex_key;

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	struct xfs_mount	*mp,
	const char		*description,
	loff_t			isize,
	struct xfile		**xfilep)
{
	char			*fname;
	struct xfile		*xf;
	int			error = -ENOMEM;

	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
	if (!xf)
		return -ENOMEM;

	fname = kmalloc(MAXNAMELEN, XCHK_GFP_FLAGS);
	if (!fname)
		goto out_xfile;
	snprintf(fname, MAXNAMELEN - 1, "XFS (%s): %s", mp->m_super->s_id,
			description);
	fname[MAXNAMELEN - 1] = 0;

	xf->file = shmem_file_setup(fname, isize, 0);
	if (!xf->file)
		goto out_fname;
	if (IS_ERR(xf->file)) {
		error = PTR_ERR(xf->file);
		goto out_fname;
	}

	/*
	 * We want a large sparse file that we can pread, pwrite, and seek.
	 * xfile users are responsible for keeping the xfile hidden away from
	 * all other callers, so we skip timestamp updates and security checks.
	 */
	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
			    FMODE_LSEEK;
	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
	xf->file->f_inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
	lockdep_set_class(&file_inode(xf->file)->i_rwsem, &xfile_i_mutex_key);

	trace_xfile_create(mp, xf);

	kfree(fname);
	*xfilep = xf;
	return 0;
out_fname:
	kfree(fname);
out_xfile:
	kfree(xf);
	return error;
}
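
/*
 * Example: a minimal xfile lifecycle.  This is only a sketch; the description
 * string is made up, and an initial size of zero is assumed to be acceptable
 * because shmem's write_end extends i_size as data lands in the file.  Real
 * callers must do proper error handling:
 *
 *	struct xfile	*xf;
 *	int		error;
 *
 *	error = xfile_create(mp, "example staging data", 0, &xf);
 *	if (error)
 *		return error;
 *
 *	...stage records with xfile_pwrite, read them back with xfile_pread...
 *
 *	xfile_destroy(xf);
 */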

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct inode		*inode = file_inode(xf->file);

	trace_xfile_destroy(xf);

	lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
	fput(xf->file);
	kfree(xf);
}

/*
 * Read a memory object directly from the xfile's page cache.  Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read.  Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	struct page		*page = NULL;
	ssize_t			read = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pread(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*p, *kaddr;
		unsigned int	len;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * In-kernel reads of a shmem file cause it to allocate a page
		 * if the mapping shows a hole.  Therefore, if we hit ENOMEM
		 * we can continue by zeroing the caller's buffer.
		 */
		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
				__GFP_NOWARN);
		if (IS_ERR(page)) {
			error = PTR_ERR(page);
			if (error != -ENOMEM)
				break;

			page = NULL;
		}
		if (!page) {
			memset(buf, 0, len);
			goto advance;
		}

		if (PageHWPoison(page)) {
			put_page(page);
			error = -EIO;
			break;
		}

		if (PageUptodate(page)) {
			/*
			 * xfile pages must never be mapped into userspace, so
			 * we skip the dcache flush.
			 */
			kaddr = kmap_local_page(page);
			p = kaddr + offset_in_page(pos);
			memcpy(buf, p, len);
			kunmap_local(kaddr);
		} else {
			memset(buf, 0, len);
		}
		put_page(page);

advance:
		count -= len;
		pos += len;
		buf += len;
		read += len;
	}
	memalloc_nofs_restore(pflags);

	if (read > 0)
		return read;
	return error;
}
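
/*
 * Example: staging a fixed-size record and reading it back with the pread and
 * pwrite helpers.  A sketch only; "struct foo_rec" and "idx" are
 * hypothetical, and a short read or write of an in-bounds object should not
 * happen, so this caller chooses to treat it as an I/O error:
 *
 *	struct foo_rec	rec = ...;
 *	loff_t		pos = idx * sizeof(rec);
 *	ssize_t		ret;
 *
 *	ret = xfile_pwrite(xf, &rec, sizeof(rec), pos);
 *	if (ret < 0)
 *		return ret;
 *	if (ret != sizeof(rec))
 *		return -EIO;
 *
 *	ret = xfile_pread(xf, &rec, sizeof(rec), pos);
 *	if (ret < 0)
 *		return ret;
 */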

/*
 * Write a memory object directly to the xfile's page cache.  Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write.  Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	ssize_t			written = 0;
	unsigned int		pflags;
	int			error = 0;

	if (count > MAX_RW_COUNT)
		return -E2BIG;
	if (inode->i_sb->s_maxbytes - pos < count)
		return -EFBIG;

	trace_xfile_pwrite(xf, pos, count);

	pflags = memalloc_nofs_save();
	while (count > 0) {
		void		*fsdata;
		void		*p, *kaddr;
		unsigned int	len;
		int		ret;

		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));

		/*
		 * We call write_begin directly here to avoid all the freezer
		 * protection lock-taking that happens in the normal path.
		 * shmem doesn't support fs freeze, but lockdep doesn't know
		 * that and will trip over it.
		 */
		error = aops->write_begin(NULL, mapping, pos, len, &page,
				&fsdata);
		if (error)
			break;

		if (PageHWPoison(page)) {
			error = aops->write_end(NULL, mapping, pos, len, 0,
					page, fsdata);
			if (error >= 0)
				error = -EIO;
			break;
		}

		/*
		 * xfile pages must never be mapped into userspace, so we skip
		 * the dcache flush.  If the page is not uptodate, zero it
		 * before writing data.
		 */
		kaddr = kmap_local_page(page);
		if (!PageUptodate(page)) {
			memset(kaddr, 0, PAGE_SIZE);
			SetPageUptodate(page);
		}
		p = kaddr + offset_in_page(pos);
		memcpy(p, buf, len);
		kunmap_local(kaddr);

		ret = aops->write_end(NULL, mapping, pos, len, len, page,
				fsdata);
		if (ret < 0) {
			error = ret;
			break;
		}

		written += ret;
		if (ret != len)
			break;

		count -= ret;
		pos += ret;
		buf += ret;
	}
	memalloc_nofs_restore(pflags);

	if (written > 0)
		return written;
	return error;
}

/* Find the next written area in the xfile data for a given offset. */
loff_t
xfile_seek_data(
	struct xfile		*xf,
	loff_t			pos)
{
	loff_t			ret;

	ret = vfs_llseek(xf->file, pos, SEEK_DATA);
	trace_xfile_seek_data(xf, pos, ret);
	return ret;
}

/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct kstat		ks;
	int			error;

	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
	if (error)
		return error;

	statbuf->size = ks.size;
	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
	return 0;
}

/*
 * Grab the (locked) page for a memory object.  The object cannot span a page
 * boundary.  Returns 0 (and a locked page) if successful, -ENOTBLK if we
 * cannot grab the page, or the usual negative errno.
 */
int
xfile_obj_get_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct page		**pagep,
	void			**fsdatap)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	struct page		*page = NULL;
	void			*fsdata = NULL;
	unsigned int		pflags;
	int			error;

	if (inode->i_sb->s_maxbytes - pos < len)
		return -ENOMEM;
	if (len > PAGE_SIZE - offset_in_page(pos))
		return -ENOTBLK;

	trace_xfile_obj_get_page(xf, pos, len);

	pflags = memalloc_nofs_save();

	/*
	 * We call write_begin directly here to avoid all the freezer
	 * protection lock-taking that happens in the normal path.  shmem
	 * doesn't support fs freeze, but lockdep doesn't know that and will
	 * trip over it.
	 */
	error = aops->write_begin(NULL, mapping, pos, len, &page, &fsdata);
	if (error)
		goto out_pflags;

	/* Bail out if the memory page is poisoned. */
	if (PageHWPoison(page)) {
		int	ret;

		ret = aops->write_end(NULL, mapping, pos, len, 0, page,
				fsdata);
		if (ret < 0)
			error = ret;
		else
			error = -EIO;
		goto out_pflags;
	}

	/* We got the page, so make sure we push out EOF. */
	if (i_size_read(inode) < pos + len)
		i_size_write(inode, pos + len);

	/*
	 * If the page isn't up to date, fill it with zeroes before we hand it
	 * to the caller and make sure the backing store will hold on to them.
	 */
	if (!PageUptodate(page)) {
		void	*kaddr;

		kaddr = kmap_local_page(page);
		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		SetPageUptodate(page);
		set_page_dirty(page);
	}

	*pagep = page;
	*fsdatap = fsdata;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}
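
/*
 * Example: updating an object in place through the page cache using the
 * get_page/put_page pair.  A sketch only; the object at (pos, len) is
 * hypothetical and must not cross a page boundary, or xfile_obj_get_page
 * returns -ENOTBLK:
 *
 *	struct page	*page;
 *	void		*fsdata;
 *	void		*p;
 *	int		error;
 *
 *	error = xfile_obj_get_page(xf, pos, len, &page, &fsdata);
 *	if (error)
 *		return error;
 *	p = kmap_local_page(page) + offset_in_page(pos);
 *	...modify the object through p...
 *	kunmap_local(p);
 *	return xfile_obj_put_page(xf, pos, len, page, fsdata);
 */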

/*
 * Release the (locked) page for a memory object.  The page must have been
 * obtained by xfile_obj_get_page.  Returns 0 or a negative errno.
 */
int
xfile_obj_put_page(
	struct xfile		*xf,
	loff_t			pos,
	unsigned int		len,
	struct page		*page,
	void			*fsdata)
{
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	unsigned int		pflags;
	int			ret;

	ASSERT(len <= PAGE_SIZE - offset_in_page(pos));

	trace_xfile_obj_put_page(xf, pos, len);

	pflags = memalloc_nofs_save();
	ret = aops->write_end(NULL, mapping, pos, len, len, page, fsdata);
	memalloc_nofs_restore(pflags);

	if (ret < 0)
		return ret;
	if (ret != len)
		return -EIO;
	return 0;
}

/* Dump an xfile to dmesg. */
int
xfile_dump(
	struct xfile		*xf)
{
	struct xfile_stat	sb;
	struct inode		*inode = file_inode(xf->file);
	struct address_space	*mapping = inode->i_mapping;
	loff_t			holepos = 0;
	loff_t			datapos;
	loff_t			ret;
	unsigned int		pflags;
	bool			all_zeroes = true;
	int			error = 0;

	error = xfile_stat(xf, &sb);
	if (error)
		return error;

	printk(KERN_ALERT "xfile ino 0x%lx isize 0x%llx dump:", inode->i_ino,
			sb.size);

	pflags = memalloc_nofs_save();
	while ((ret = vfs_llseek(xf->file, holepos, SEEK_DATA)) >= 0) {
		datapos = rounddown_64(ret, PAGE_SIZE);
		ret = vfs_llseek(xf->file, datapos, SEEK_HOLE);
		if (ret < 0)
			break;
		holepos = min_t(loff_t, sb.size, roundup_64(ret, PAGE_SIZE));

		while (datapos < holepos) {
			struct page	*page = NULL;
			void		*p, *kaddr;
			u64		datalen = holepos - datapos;
			unsigned int	pagepos;
			unsigned int	pagelen;

			cond_resched();

			if (fatal_signal_pending(current)) {
				error = -EINTR;
				goto out_pflags;
			}

			pagelen = min_t(u64, datalen, PAGE_SIZE);

			page = shmem_read_mapping_page_gfp(mapping,
					datapos >> PAGE_SHIFT, __GFP_NOWARN);
			if (IS_ERR(page)) {
				error = PTR_ERR(page);
				if (error != -ENOMEM)
					goto out_pflags;

				goto next_pgoff;
			}
			if (!PageUptodate(page))
				goto next_page;
			if (PageHWPoison(page)) {
				printk(KERN_ALERT "%.8llx: poisoned",
						datapos);
				goto next_page;
			}

			kaddr = kmap_local_page(page);
			p = kaddr;

			for (pagepos = 0; pagepos < pagelen; pagepos += 16) {
				char		prefix[16];
				unsigned int	linelen;

				/* Clamp the last line to the end of the data. */
				linelen = min_t(unsigned int,
						pagelen - pagepos, 16);

				if (!memchr_inv(p + pagepos, 0, linelen))
					continue;

				snprintf(prefix, 16, "%.8llx: ",
						datapos + pagepos);

				all_zeroes = false;
				print_hex_dump(KERN_ALERT, prefix,
						DUMP_PREFIX_NONE, 16, 1,
						p + pagepos, linelen, true);
			}
			kunmap_local(kaddr);
next_page:
			put_page(page);
next_pgoff:
			datapos += PAGE_SIZE;
		}
	}
	if (all_zeroes)
		printk(KERN_ALERT "(all zeroes)");
	if (ret != -ENXIO)
		error = ret;
out_pflags:
	memalloc_nofs_restore(pflags);
	return error;
}
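
/*
 * Example: checking how much memory an xfile currently pins, e.g. to report
 * usage in tracepoints or decide whether to keep staging data.  A sketch
 * only; note that bytes reflects blocks actually allocated by shmem, not the
 * nominal file size:
 *
 *	struct xfile_stat	xs;
 *	int			error;
 *
 *	error = xfile_stat(xf, &xs);
 *	if (error)
 *		return error;
 *	trace_printk("xfile holds %llu bytes of %llu\n", xs.bytes, xs.size);
 */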