Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_addr.c | 1
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Kconfig.binfmt | 4
-rw-r--r--  fs/binfmt_elf.c | 6
-rw-r--r--  fs/block_dev.c | 271
-rw-r--r--  fs/btrfs/disk-io.c | 8
-rw-r--r--  fs/btrfs/extent_io.c | 16
-rw-r--r--  fs/btrfs/inode.c | 11
-rw-r--r--  fs/btrfs/scrub.c | 2
-rw-r--r--  fs/btrfs/super.c | 24
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 4
-rw-r--r--  fs/btrfs/volumes.h | 2
-rw-r--r--  fs/buffer.c | 130
-rw-r--r--  fs/ceph/dir.c | 24
-rw-r--r--  fs/cifs/cifsencrypt.c | 11
-rw-r--r--  fs/cifs/cifssmb.c | 4
-rw-r--r--  fs/cifs/connect.c | 26
-rw-r--r--  fs/cifs/transport.c | 1
-rw-r--r--  fs/crypto/Kconfig | 2
-rw-r--r--  fs/crypto/crypto.c | 123
-rw-r--r--  fs/crypto/fname.c | 8
-rw-r--r--  fs/crypto/fscrypt_private.h | 93
-rw-r--r--  fs/crypto/keyinfo.c | 8
-rw-r--r--  fs/crypto/policy.c | 36
-rw-r--r--  fs/dax.c | 1320
-rw-r--r--  fs/direct-io.c | 34
-rw-r--r--  fs/dlm/ast.c | 2
-rw-r--r--  fs/dlm/config.c | 2
-rw-r--r--  fs/dlm/debug_fs.c | 2
-rw-r--r--  fs/dlm/dlm_internal.h | 1
-rw-r--r--  fs/dlm/lockspace.c | 2
-rw-r--r--  fs/dlm/lowcomms.c | 28
-rw-r--r--  fs/dlm/main.c | 2
-rw-r--r--  fs/dlm/netlink.c | 18
-rw-r--r--  fs/dlm/user.c | 1
-rw-r--r--  fs/exec.c | 25
-rw-r--r--  fs/ext2/file.c | 35
-rw-r--r--  fs/ext2/inode.c | 20
-rw-r--r--  fs/ext4/Kconfig | 1
-rw-r--r--  fs/ext4/acl.c | 2
-rw-r--r--  fs/ext4/ext4.h | 31
-rw-r--r--  fs/ext4/ext4_jbd2.h | 14
-rw-r--r--  fs/ext4/extents.c | 40
-rw-r--r--  fs/ext4/file.c | 184
-rw-r--r--  fs/ext4/ialloc.c | 5
-rw-r--r--  fs/ext4/inline.c | 18
-rw-r--r--  fs/ext4/inode.c | 367
-rw-r--r--  fs/ext4/ioctl.c | 82
-rw-r--r--  fs/ext4/mballoc.c | 4
-rw-r--r--  fs/ext4/mmp.c | 6
-rw-r--r--  fs/ext4/namei.c | 24
-rw-r--r--  fs/ext4/page-io.c | 7
-rw-r--r--  fs/ext4/super.c | 154
-rw-r--r--  fs/ext4/xattr.c | 45
-rw-r--r--  fs/f2fs/acl.c | 2
-rw-r--r--  fs/f2fs/checkpoint.c | 38
-rw-r--r--  fs/f2fs/data.c | 212
-rw-r--r--  fs/f2fs/debug.c | 29
-rw-r--r--  fs/f2fs/dir.c | 30
-rw-r--r--  fs/f2fs/extent_cache.c | 2
-rw-r--r--  fs/f2fs/f2fs.h | 203
-rw-r--r--  fs/f2fs/file.c | 86
-rw-r--r--  fs/f2fs/gc.c | 35
-rw-r--r--  fs/f2fs/inline.c | 16
-rw-r--r--  fs/f2fs/inode.c | 47
-rw-r--r--  fs/f2fs/namei.c | 6
-rw-r--r--  fs/f2fs/node.c | 230
-rw-r--r--  fs/f2fs/node.h | 13
-rw-r--r--  fs/f2fs/recovery.c | 46
-rw-r--r--  fs/f2fs/segment.c | 240
-rw-r--r--  fs/f2fs/segment.h | 28
-rw-r--r--  fs/f2fs/shrinker.c | 10
-rw-r--r--  fs/f2fs/super.c | 283
-rw-r--r--  fs/f2fs/xattr.c | 4
-rw-r--r--  fs/fs-writeback.c | 16
-rw-r--r--  fs/fuse/dir.c | 7
-rw-r--r--  fs/gfs2/dir.c | 1
-rw-r--r--  fs/gfs2/log.c | 4
-rw-r--r--  fs/gfs2/lops.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 7
-rw-r--r--  fs/gfs2/ops_fstype.c | 2
-rw-r--r--  fs/hfsplus/super.c | 4
-rw-r--r--  fs/internal.h | 3
-rw-r--r--  fs/iomap.c | 378
-rw-r--r--  fs/isofs/compress.c | 1
-rw-r--r--  fs/isofs/rock.c | 4
-rw-r--r--  fs/jbd2/checkpoint.c | 2
-rw-r--r--  fs/jbd2/commit.c | 9
-rw-r--r--  fs/jbd2/journal.c | 15
-rw-r--r--  fs/jbd2/revoke.c | 2
-rw-r--r--  fs/jfs/ioctl.c | 2
-rw-r--r--  fs/jfs/jfs_logmgr.c | 4
-rw-r--r--  fs/kernfs/inode.c | 4
-rw-r--r--  fs/lockd/netns.h | 2
-rw-r--r--  fs/lockd/svc.c | 2
-rw-r--r--  fs/mbcache.c | 41
-rw-r--r--  fs/mpage.c | 9
-rw-r--r--  fs/nfs/callback_proc.c | 99
-rw-r--r--  fs/nfs/client.c | 6
-rw-r--r--  fs/nfs/delegation.c | 4
-rw-r--r--  fs/nfs/dir.c | 73
-rw-r--r--  fs/nfs/direct.c | 12
-rw-r--r--  fs/nfs/file.c | 5
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 3
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 313
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 23
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 151
-rw-r--r--  fs/nfs/inode.c | 53
-rw-r--r--  fs/nfs/internal.h | 9
-rw-r--r--  fs/nfs/netns.h | 2
-rw-r--r--  fs/nfs/nfs3client.c | 5
-rw-r--r--  fs/nfs/nfs42proc.c | 9
-rw-r--r--  fs/nfs/nfs42xdr.c | 5
-rw-r--r--  fs/nfs/nfs4_fs.h | 2
-rw-r--r--  fs/nfs/nfs4client.c | 32
-rw-r--r--  fs/nfs/nfs4file.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 233
-rw-r--r--  fs/nfs/nfs4session.c | 2
-rw-r--r--  fs/nfs/nfs4state.c | 27
-rw-r--r--  fs/nfs/nfs4xdr.c | 96
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 4
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 1
-rw-r--r--  fs/nfs/pagelist.c | 3
-rw-r--r--  fs/nfs/pnfs.c | 407
-rw-r--r--  fs/nfs/pnfs.h | 77
-rw-r--r--  fs/nfs/pnfs_nfs.c | 28
-rw-r--r--  fs/nfs/super.c | 2
-rw-r--r--  fs/nfs/write.c | 3
-rw-r--r--  fs/nfs_common/grace.c | 2
-rw-r--r--  fs/nfsd/netns.h | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 2
-rw-r--r--  fs/nilfs2/super.c | 2
-rw-r--r--  fs/nsfs.c | 2
-rw-r--r--  fs/ntfs/aops.c | 3
-rw-r--r--  fs/ntfs/file.c | 5
-rw-r--r--  fs/ntfs/logfile.c | 1
-rw-r--r--  fs/ntfs/mft.c | 1
-rw-r--r--  fs/ocfs2/aops.c | 9
-rw-r--r--  fs/ocfs2/aops.h | 3
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 1
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 11
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/journal.c | 4
-rw-r--r--  fs/ocfs2/mmap.c | 3
-rw-r--r--  fs/ocfs2/namei.c | 6
-rw-r--r--  fs/ocfs2/ocfs2.h | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 1
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/orangefs/inode.c | 1
-rw-r--r--  fs/orangefs/orangefs-debugfs.c | 4
-rw-r--r--  fs/orangefs/orangefs-sysfs.c | 9
-rw-r--r--  fs/overlayfs/super.c | 6
-rw-r--r--  fs/proc/array.c | 7
-rw-r--r--  fs/proc/base.c | 56
-rw-r--r--  fs/proc/fd.c | 6
-rw-r--r--  fs/proc/generic.c | 1
-rw-r--r--  fs/proc/inode.c | 37
-rw-r--r--  fs/proc/internal.h | 6
-rw-r--r--  fs/proc/namespaces.c | 3
-rw-r--r--  fs/proc/root.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 1
-rw-r--r--  fs/pstore/Kconfig | 2
-rw-r--r--  fs/pstore/ftrace.c | 11
-rw-r--r--  fs/pstore/inode.c | 15
-rw-r--r--  fs/pstore/internal.h | 34
-rw-r--r--  fs/pstore/platform.c | 5
-rw-r--r--  fs/pstore/ram.c | 327
-rw-r--r--  fs/pstore/ram_core.c | 27
-rw-r--r--  fs/quota/netlink.c | 10
-rw-r--r--  fs/reiserfs/inode.c | 1
-rw-r--r--  fs/reiserfs/journal.c | 6
-rw-r--r--  fs/reiserfs/stree.c | 1
-rw-r--r--  fs/splice.c | 1
-rw-r--r--  fs/squashfs/block.c | 1
-rw-r--r--  fs/ubifs/Kconfig | 11
-rw-r--r--  fs/ubifs/Makefile | 1
-rw-r--r--  fs/ubifs/crypto.c | 97
-rw-r--r--  fs/ubifs/debug.c | 14
-rw-r--r--  fs/ubifs/dir.c | 478
-rw-r--r--  fs/ubifs/file.c | 108
-rw-r--r--  fs/ubifs/gc.c | 4
-rw-r--r--  fs/ubifs/io.c | 18
-rw-r--r--  fs/ubifs/ioctl.c | 20
-rw-r--r--  fs/ubifs/journal.c | 224
-rw-r--r--  fs/ubifs/key.h | 21
-rw-r--r--  fs/ubifs/replay.c | 10
-rw-r--r--  fs/ubifs/sb.c | 59
-rw-r--r--  fs/ubifs/super.c | 17
-rw-r--r--  fs/ubifs/tnc.c | 159
-rw-r--r--  fs/ubifs/ubifs-media.h | 29
-rw-r--r--  fs/ubifs/ubifs.h | 115
-rw-r--r--  fs/ubifs/xattr.c | 116
-rw-r--r--  fs/udf/dir.c | 1
-rw-r--r--  fs/udf/directory.c | 1
-rw-r--r--  fs/udf/inode.c | 1
-rw-r--r--  fs/ufs/balloc.c | 4
-rw-r--r--  fs/ufs/inode.c | 3
-rw-r--r--  fs/userfaultfd.c | 22
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 10
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 336
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 9
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c | 3
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 20
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 43
-rw-r--r--  fs/xfs/libxfs/xfs_cksum.h | 26
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 26
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_priv.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 18
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 77
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 2
-rw-r--r--  fs/xfs/libxfs/xfs_refcount_btree.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_rmap_btree.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 13
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 4
-rw-r--r--  fs/xfs/xfs_aops.c | 309
-rw-r--r--  fs/xfs/xfs_aops.h | 9
-rw-r--r--  fs/xfs/xfs_attr.h | 4
-rw-r--r--  fs/xfs/xfs_attr_list.c | 59
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 45
-rw-r--r--  fs/xfs/xfs_buf.c | 125
-rw-r--r--  fs/xfs/xfs_buf.h | 3
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 2
-rw-r--r--  fs/xfs/xfs_file.c | 274
-rw-r--r--  fs/xfs/xfs_icache.c | 40
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 84
-rw-r--r--  fs/xfs/xfs_inode.h | 18
-rw-r--r--  fs/xfs/xfs_inode_item.c | 4
-rw-r--r--  fs/xfs/xfs_ioctl.c | 8
-rw-r--r--  fs/xfs/xfs_iomap.c | 104
-rw-r--r--  fs/xfs/xfs_iops.c | 14
-rw-r--r--  fs/xfs/xfs_linux.h | 1
-rw-r--r--  fs/xfs/xfs_log.c | 41
-rw-r--r--  fs/xfs/xfs_log_recover.c | 16
-rw-r--r--  fs/xfs/xfs_mount.c | 7
-rw-r--r--  fs/xfs/xfs_mount.h | 7
-rw-r--r--  fs/xfs/xfs_pnfs.c | 7
-rw-r--r--  fs/xfs/xfs_pnfs.h | 4
-rw-r--r--  fs/xfs/xfs_qm.c | 2
-rw-r--r--  fs/xfs/xfs_reflink.c | 191
-rw-r--r--  fs/xfs/xfs_reflink.h | 6
-rw-r--r--  fs/xfs/xfs_stats.c | 10
-rw-r--r--  fs/xfs/xfs_stats.h | 200
-rw-r--r--  fs/xfs/xfs_super.c | 27
-rw-r--r--  fs/xfs/xfs_symlink.c | 7
-rw-r--r--  fs/xfs/xfs_trace.h | 109
-rw-r--r--  fs/xfs/xfs_xattr.c | 23
261 files changed, 7086 insertions, 4574 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6181ad79e1a5..5ca1fb0043f6 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -34,6 +34,7 @@
#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/uio.h>
+#include <linux/bvec.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/Kconfig b/fs/Kconfig
index 884653fc6a8b..c2a377cdda2b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -55,7 +55,6 @@ config FS_DAX_PMD
depends on FS_DAX
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
- depends on BROKEN
endif # BLOCK
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4c09d93d9569..b2f82cf6bf86 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -170,8 +170,8 @@ config BINFMT_MISC
You can do other nice things, too. Read the file
<file:Documentation/binfmt_misc.txt> to learn how to use this
- feature, <file:Documentation/java.txt> for information about how
- to include Java support. and <file:Documentation/mono.txt> for
+ feature, <file:Documentation/admin-guide/java.rst> for information about how
+ to include Java support. and <file:Documentation/admin-guide/mono.rst> for
information about how to include Mono-based .NET support.
To use binfmt_misc, you will need to mount it:
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2798c7..e6c1bd443806 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
+ if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
+ goto end_coredump;
+ vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
if (!vma_filesz)
goto end_coredump;
@@ -2311,7 +2313,7 @@ end_coredump:
cleanup:
free_note_info(&info);
kfree(shdr4extnum);
- kfree(vma_filesz);
+ vfree(vma_filesz);
kfree(phdr4note);
kfree(elf);
out:
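The hunk above swaps kmalloc_array() for vmalloc() because the per-VMA size table for a core dump can grow past what kmalloc() reliably serves, and vmalloc() has no built-in multiply-overflow check, hence the explicit guard. A minimal sketch of the same pattern (the helper name is illustrative, not from the kernel):

	/* Overflow-checked vmalloc of an n-element array; pair with vfree(). */
	static void *vmalloc_array_sketch(unsigned long n, size_t elem_size)
	{
		if (n > ULONG_MAX / elem_size)	/* n * elem_size would wrap */
			return NULL;
		return vmalloc(n * elem_size);
	}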
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 05b553368bb4..7c4507224ed6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -30,6 +30,7 @@
#include <linux/cleancache.h>
#include <linux/dax.h>
#include <linux/badblocks.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -175,16 +176,269 @@ static struct inode *bdev_file_inode(struct file *file)
return file->f_mapping->host;
}
+static unsigned int dio_bio_write_op(struct kiocb *iocb)
+{
+ unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+
+ /* avoid the need for an I/O completion work item */
+ if (iocb->ki_flags & IOCB_DSYNC)
+ op |= REQ_FUA;
+ return op;
+}
+
+#define DIO_INLINE_BIO_VECS 4
+
+static void blkdev_bio_end_io_simple(struct bio *bio)
+{
+ struct task_struct *waiter = bio->bi_private;
+
+ WRITE_ONCE(bio->bi_private, NULL);
+ wake_up_process(waiter);
+}
+
static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
+ int nr_pages)
+{
+ struct file *file = iocb->ki_filp;
+ struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+ struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
+ loff_t pos = iocb->ki_pos;
+ bool should_dirty = false;
+ struct bio bio;
+ ssize_t ret;
+ blk_qc_t qc;
+ int i;
+
+ if ((pos | iov_iter_alignment(iter)) &
+ (bdev_logical_block_size(bdev) - 1))
+ return -EINVAL;
+
+ if (nr_pages <= DIO_INLINE_BIO_VECS)
+ vecs = inline_vecs;
+ else {
+ vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL);
+ if (!vecs)
+ return -ENOMEM;
+ }
+
+ bio_init(&bio, vecs, nr_pages);
+ bio.bi_bdev = bdev;
+ bio.bi_iter.bi_sector = pos >> 9;
+ bio.bi_private = current;
+ bio.bi_end_io = blkdev_bio_end_io_simple;
+
+ ret = bio_iov_iter_get_pages(&bio, iter);
+ if (unlikely(ret))
+ return ret;
+ ret = bio.bi_iter.bi_size;
+
+ if (iov_iter_rw(iter) == READ) {
+ bio.bi_opf = REQ_OP_READ;
+ if (iter_is_iovec(iter))
+ should_dirty = true;
+ } else {
+ bio.bi_opf = dio_bio_write_op(iocb);
+ task_io_account_write(ret);
+ }
+
+ qc = submit_bio(&bio);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(bio.bi_private))
+ break;
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !blk_mq_poll(bdev_get_queue(bdev), qc))
+ io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ bio_for_each_segment_all(bvec, &bio, i) {
+ if (should_dirty && !PageCompound(bvec->bv_page))
+ set_page_dirty_lock(bvec->bv_page);
+ put_page(bvec->bv_page);
+ }
+
+ if (vecs != inline_vecs)
+ kfree(vecs);
+
+ if (unlikely(bio.bi_error))
+ return bio.bi_error;
+ return ret;
+}
+
+struct blkdev_dio {
+ union {
+ struct kiocb *iocb;
+ struct task_struct *waiter;
+ };
+ size_t size;
+ atomic_t ref;
+ bool multi_bio : 1;
+ bool should_dirty : 1;
+ bool is_sync : 1;
+ struct bio bio;
+};
+
+static struct bio_set *blkdev_dio_pool __read_mostly;
+
+static void blkdev_bio_end_io(struct bio *bio)
+{
+ struct blkdev_dio *dio = bio->bi_private;
+ bool should_dirty = dio->should_dirty;
+
+ if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
+ if (bio->bi_error && !dio->bio.bi_error)
+ dio->bio.bi_error = bio->bi_error;
+ } else {
+ if (!dio->is_sync) {
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret = dio->bio.bi_error;
+
+ if (likely(!ret)) {
+ ret = dio->size;
+ iocb->ki_pos += ret;
+ }
+
+ dio->iocb->ki_complete(iocb, ret, 0);
+ bio_put(&dio->bio);
+ } else {
+ struct task_struct *waiter = dio->waiter;
+
+ WRITE_ONCE(dio->waiter, NULL);
+ wake_up_process(waiter);
+ }
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
+ bio_put(bio);
+ }
+}
+
+static ssize_t
+__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
{
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(inode);
+ struct blkdev_dio *dio;
+ struct bio *bio;
+ bool is_read = (iov_iter_rw(iter) == READ);
+ loff_t pos = iocb->ki_pos;
+ blk_qc_t qc = BLK_QC_T_NONE;
+ int ret;
+
+ if ((pos | iov_iter_alignment(iter)) &
+ (bdev_logical_block_size(bdev) - 1))
+ return -EINVAL;
+
+ bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool);
+ bio_get(bio); /* extra ref for the completion handler */
+
+ dio = container_of(bio, struct blkdev_dio, bio);
+ dio->is_sync = is_sync_kiocb(iocb);
+ if (dio->is_sync)
+ dio->waiter = current;
+ else
+ dio->iocb = iocb;
+
+ dio->size = 0;
+ dio->multi_bio = false;
+ dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
+
+ for (;;) {
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = pos >> 9;
+ bio->bi_private = dio;
+ bio->bi_end_io = blkdev_bio_end_io;
+
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (unlikely(ret)) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ break;
+ }
+
+ if (is_read) {
+ bio->bi_opf = REQ_OP_READ;
+ if (dio->should_dirty)
+ bio_set_pages_dirty(bio);
+ } else {
+ bio->bi_opf = dio_bio_write_op(iocb);
+ task_io_account_write(bio->bi_iter.bi_size);
+ }
+
+ dio->size += bio->bi_iter.bi_size;
+ pos += bio->bi_iter.bi_size;
+
+ nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
+ if (!nr_pages) {
+ qc = submit_bio(bio);
+ break;
+ }
+
+ if (!dio->multi_bio) {
+ dio->multi_bio = true;
+ atomic_set(&dio->ref, 2);
+ } else {
+ atomic_inc(&dio->ref);
+ }
- return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
- blkdev_get_block, NULL, NULL,
- DIO_SKIP_DIO_COUNT);
+ submit_bio(bio);
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
+ }
+
+ if (!dio->is_sync)
+ return -EIOCBQUEUED;
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(dio->waiter))
+ break;
+
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !blk_mq_poll(bdev_get_queue(bdev), qc))
+ io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ ret = dio->bio.bi_error;
+ if (likely(!ret))
+ ret = dio->size;
+
+ bio_put(&dio->bio);
+ return ret;
+}
+
+static ssize_t
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ int nr_pages;
+
+ nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1);
+ if (!nr_pages)
+ return 0;
+ if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES)
+ return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+
+ return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES));
+}
+
+static __init int blkdev_init(void)
+{
+ blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio));
+ if (!blkdev_dio_pool)
+ return -ENOMEM;
+ return 0;
}
+module_init(blkdev_init);
int __sync_blockdev(struct block_device *bdev, int wait)
{
@@ -832,7 +1086,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
return true; /* already a holder */
else if (bdev->bd_holder != NULL)
return false; /* held by someone else */
- else if (bdev->bd_contains == bdev)
+ else if (whole == bdev)
return true; /* is a whole device which isn't held */
else if (whole->bd_holder == bd_may_claim)
@@ -1950,6 +2204,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
spin_lock(&blockdev_superblock->s_inode_list_lock);
list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
struct address_space *mapping = inode->i_mapping;
+ struct block_device *bdev;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
@@ -1970,8 +2225,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
*/
iput(old_inode);
old_inode = inode;
+ bdev = I_BDEV(inode);
- func(I_BDEV(inode), arg);
+ mutex_lock(&bdev->bd_mutex);
+ if (bdev->bd_openers)
+ func(bdev, arg);
+ mutex_unlock(&bdev->bd_mutex);
spin_lock(&blockdev_superblock->s_inode_list_lock);
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3a57f99d96aa..fe10afd51e02 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -930,7 +930,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
atomic_inc(&fs_info->nr_async_submits);
- if (bio->bi_opf & REQ_SYNC)
+ if (op_is_sync(bio->bi_opf))
btrfs_set_work_high_priority(&async->work);
btrfs_queue_work(fs_info->workers, &async->work);
@@ -3485,9 +3485,9 @@ static int write_dev_supers(struct btrfs_device *device,
* to go down lazy.
*/
if (i == 0)
- ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh);
+ ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh);
else
- ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+ ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
if (ret)
errors++;
}
@@ -3551,7 +3551,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
bio->bi_end_io = btrfs_end_empty_barrier;
bio->bi_bdev = device->bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+ bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
device->flush_bio = bio;
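Several hunks in this series spell out raw req_flag_bits instead of the legacy write-hint macros: WRITE_SYNC becomes REQ_SYNC, WRITE_FUA becomes REQ_FUA, and WRITE_FLUSH becomes REQ_PREFLUSH. Sync-ness is then tested through op_is_sync() rather than by peeking at REQ_SYNC, because reads count as synchronous even without the flag; roughly (a sketch, not the exact block-layer definition):

	static inline bool op_is_sync_sketch(unsigned int op)
	{
		/* reads are always sync; writes only when explicitly flagged */
		return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC);
	}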
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8ed05d95584a..1e67723c27a1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -127,7 +127,7 @@ struct extent_page_data {
*/
unsigned int extent_locked:1;
- /* tells the submit_bio code to use a WRITE_SYNC */
+ /* tells the submit_bio code to use REQ_SYNC */
unsigned int sync_io:1;
};
@@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
bio_add_page(bio, page, length, pg_offset);
if (btrfsic_submit_bio_wait(bio)) {
@@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
struct inode *inode = page->mapping->host;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct bio *bio;
- int read_mode;
+ int read_mode = 0;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
}
if (failed_bio->bi_vcnt > 1)
- read_mode = READ_SYNC | REQ_FAILFAST_DEV;
- else
- read_mode = READ_SYNC;
+ read_mode |= REQ_FAILFAST_DEV;
phy_offset >>= inode->i_sb->s_blocksize_bits;
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
@@ -3484,7 +3482,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
unsigned long nr_written = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = WRITE_SYNC;
+ write_flags = REQ_SYNC;
trace___extent_writepage(page, inode, wbc);
@@ -3729,7 +3727,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
unsigned long i, num_pages;
unsigned long bio_flags = 0;
unsigned long start, end;
- int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META;
+ int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META;
int ret = 0;
clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
@@ -4076,7 +4074,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
int ret;
bio_set_op_attrs(epd->bio, REQ_OP_WRITE,
- epd->sync_io ? WRITE_SYNC : 0);
+ epd->sync_io ? REQ_SYNC : 0);
ret = submit_one_bio(epd->bio, 0, epd->bio_flags);
BUG_ON(ret < 0); /* -ENOMEM */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8e3a5a266917..a4c879671b9d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7917,7 +7917,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec;
struct bio *bio;
int isector;
- int read_mode;
+ int read_mode = 0;
int ret;
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -7936,9 +7936,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
if ((failed_bio->bi_vcnt > 1)
|| (failed_bio->bi_io_vec->bv_len
> BTRFS_I(inode)->root->sectorsize))
- read_mode = READ_SYNC | REQ_FAILFAST_DEV;
- else
- read_mode = READ_SYNC;
+ read_mode |= REQ_FAILFAST_DEV;
isector = start - btrfs_io_bio(failed_bio)->logical;
isector >>= inode->i_sb->s_blocksize_bits;
@@ -8427,7 +8425,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
if (!bio)
return -ENOMEM;
- bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
+ bio->bi_opf = orig_bio->bi_opf;
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
@@ -8465,8 +8463,7 @@ next_block:
start_sector, GFP_NOFS);
if (!bio)
goto out_err;
- bio_set_op_attrs(bio, bio_op(orig_bio),
- bio_flags(orig_bio));
+ bio->bi_opf = orig_bio->bi_opf;
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index fffb9ab8526e..ff3078234d94 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -4440,7 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio->bi_bdev = dev->bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
ret = bio_add_page(bio, page, PAGE_SIZE, 0);
if (ret != PAGE_SIZE) {
leave_with_eio:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 74ed5aae6cea..3b713b6fcc26 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -202,27 +202,25 @@ static struct ratelimit_state printk_limits[] = {
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
- char lvl[4];
+ char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
- const char *type = logtypes[4];
int kern_level;
- struct ratelimit_state *ratelimit;
+ const char *type = logtypes[4];
+ struct ratelimit_state *ratelimit = &printk_limits[4];
va_start(args, fmt);
- kern_level = printk_get_level(fmt);
- if (kern_level) {
+ while ((kern_level = printk_get_level(fmt)) != 0) {
size_t size = printk_skip_level(fmt) - fmt;
- memcpy(lvl, fmt, size);
- lvl[size] = '\0';
+
+ if (kern_level >= '0' && kern_level <= '7') {
+ memcpy(lvl, fmt, size);
+ lvl[size] = '\0';
+ type = logtypes[kern_level - '0'];
+ ratelimit = &printk_limits[kern_level - '0'];
+ }
fmt += size;
- type = logtypes[kern_level - '0'];
- ratelimit = &printk_limits[kern_level - '0'];
- } else {
- *lvl = '\0';
- /* Default to debug output */
- ratelimit = &printk_limits[7];
}
vaf.fmt = fmt;
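btrfs_printk() now loops because a format string may carry more than one printk header, each being KERN_SOH plus a level character; printk_get_level() returns that character (0 when absent) and printk_skip_level() points past the header. A sketch of peeling headers off (only the printk.h helpers already used in the hunk):

	int level, log_type = 7;	/* default to debug, as above */

	while ((level = printk_get_level(fmt)) != 0) {
		if (level >= '0' && level <= '7')
			log_type = level - '0';	/* numeric header picks the table slot */
		fmt = printk_skip_level(fmt);	/* step past "\001<level>" */
	}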
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index bf62ad919a95..00ee006a8aa2 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
slot = radix_tree_iter_retry(&iter);
continue;
}
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock(&fs_info->buffer_lock);
free_extent_buffer_stale(eb);
spin_lock(&fs_info->buffer_lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 71a60cc01451..0d7d635d8bfb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6023,7 +6023,7 @@ static void btrfs_end_bio(struct bio *bio)
else
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_READ_ERRS);
- if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH)
+ if (bio->bi_opf & REQ_PREFLUSH)
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
btrfs_dev_stat_print_on_error(dev);
@@ -6100,7 +6100,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
bio->bi_next = NULL;
spin_lock(&device->io_lock);
- if (bio->bi_opf & REQ_SYNC)
+ if (op_is_sync(bio->bi_opf))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 09ed29c67848..f137ffe6654c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -62,7 +62,7 @@ struct btrfs_device {
int running_pending;
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
- /* WRITE_SYNC bios */
+ /* sync bios */
struct btrfs_pending_bios pending_sync_bios;
struct block_device *bdev;
diff --git a/fs/buffer.c b/fs/buffer.c
index b205a629001d..d21771fcf7d3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -43,6 +43,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/pagevec.h>
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -753,7 +754,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* still in flight on potentially older
* contents.
*/
- write_dirty_buffer(bh, WRITE_SYNC);
+ write_dirty_buffer(bh, REQ_SYNC);
/*
* Kick off IO for the previous mapping. Note
@@ -1604,37 +1605,80 @@ void create_empty_buffers(struct page *page,
}
EXPORT_SYMBOL(create_empty_buffers);
-/*
- * We are taking a block for data and we don't want any output from any
- * buffer-cache aliases starting from return from that function and
- * until the moment when something will explicitly mark the buffer
- * dirty (hopefully that will not happen until we will free that block ;-)
- * We don't even need to mark it not-uptodate - nobody can expect
- * anything from a newly allocated buffer anyway. We used to used
- * unmap_buffer() for such invalidation, but that was wrong. We definitely
- * don't want to mark the alias unmapped, for example - it would confuse
- * anyone who might pick it with bread() afterwards...
- *
- * Also.. Note that bforget() doesn't lock the buffer. So there can
- * be writeout I/O going on against recently-freed buffers. We don't
- * wait on that I/O in bforget() - it's more efficient to wait on the I/O
- * only if we really need to. That happens here.
- */
-void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
+/**
+ * clean_bdev_aliases: clean a range of buffers in block device
+ * @bdev: Block device to clean buffers in
+ * @block: Start of a range of blocks to clean
+ * @len: Number of blocks to clean
+ *
+ * We are taking a range of blocks for data and we don't want writeback of any
+ * buffer-cache aliases starting from return from this function and until the
+ * moment when something will explicitly mark the buffer dirty (hopefully that
+ * will not happen until we will free that block ;-) We don't even need to mark
+ * it not-uptodate - nobody can expect anything from a newly allocated buffer
+ * anyway. We used to use unmap_buffer() for such invalidation, but that was
+ * wrong. We definitely don't want to mark the alias unmapped, for example - it
+ * would confuse anyone who might pick it with bread() afterwards...
+ *
+ * Also.. Note that bforget() doesn't lock the buffer. So there can be
+ * writeout I/O going on against recently-freed buffers. We don't wait on that
+ * I/O in bforget() - it's more efficient to wait on the I/O only if we really
+ * need to. That happens here.
+ */
+void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
- struct buffer_head *old_bh;
+ struct inode *bd_inode = bdev->bd_inode;
+ struct address_space *bd_mapping = bd_inode->i_mapping;
+ struct pagevec pvec;
+ pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pgoff_t end;
+ int i;
+ struct buffer_head *bh;
+ struct buffer_head *head;
- might_sleep();
+ end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pagevec_init(&pvec, 0);
+ while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
- old_bh = __find_get_block_slow(bdev, block);
- if (old_bh) {
- clear_buffer_dirty(old_bh);
- wait_on_buffer(old_bh);
- clear_buffer_req(old_bh);
- __brelse(old_bh);
+ index = page->index;
+ if (index > end)
+ break;
+ if (!page_has_buffers(page))
+ continue;
+ /*
+ * We use page lock instead of bd_mapping->private_lock
+ * to pin buffers here since we can afford to sleep and
+ * it scales better than a global spinlock lock.
+ */
+ lock_page(page);
+ /* Recheck when the page is locked which pins bhs */
+ if (!page_has_buffers(page))
+ goto unlock_page;
+ head = page_buffers(page);
+ bh = head;
+ do {
+ if (!buffer_mapped(bh))
+ goto next;
+ if (bh->b_blocknr >= block + len)
+ break;
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+ clear_buffer_req(bh);
+next:
+ bh = bh->b_this_page;
+ } while (bh != head);
+unlock_page:
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
}
}
-EXPORT_SYMBOL(unmap_underlying_metadata);
+EXPORT_SYMBOL(clean_bdev_aliases);
/*
* Size is a power-of-two in the range 512..PAGE_SIZE,
@@ -1684,7 +1728,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode *
* prevents this contention from occurring.
*
* If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
* causes the writes to be flagged as synchronous writes.
*/
int __block_write_full_page(struct inode *inode, struct page *page,
@@ -1697,7 +1741,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int write_flags = wbc_to_write_flags(wbc);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1745,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
}
bh = bh->b_this_page;
@@ -1992,8 +2035,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
}
if (buffer_new(bh)) {
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -2633,7 +2675,7 @@ int nobh_write_begin(struct address_space *mapping,
if (!buffer_mapped(bh))
is_mapped_to_disk = 0;
if (buffer_new(bh))
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
set_buffer_uptodate(bh);
continue;
@@ -3118,7 +3160,7 @@ EXPORT_SYMBOL(submit_bh);
/**
* ll_rw_block: low-level access to block devices (DEPRECATED)
* @op: whether to %READ or %WRITE
- * @op_flags: rq_flag_bits
+ * @op_flags: req_flag_bits
* @nr: number of &struct buffer_heads in the array
* @bhs: array of pointers to &struct buffer_head
*
@@ -3210,7 +3252,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer);
int sync_dirty_buffer(struct buffer_head *bh)
{
- return __sync_dirty_buffer(bh, WRITE_SYNC);
+ return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);
@@ -3403,7 +3445,7 @@ void free_buffer_head(struct buffer_head *bh)
}
EXPORT_SYMBOL(free_buffer_head);
-static void buffer_exit_cpu(int cpu)
+static int buffer_exit_cpu_dead(unsigned int cpu)
{
int i;
struct bh_lru *b = &per_cpu(bh_lrus, cpu);
@@ -3414,14 +3456,7 @@ static void buffer_exit_cpu(int cpu)
}
this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
per_cpu(bh_accounting, cpu).nr = 0;
-}
-
-static int buffer_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
- buffer_exit_cpu((unsigned long)hcpu);
- return NOTIFY_OK;
+ return 0;
}
/**
@@ -3471,6 +3506,7 @@ EXPORT_SYMBOL(bh_submit_read);
void __init buffer_init(void)
{
unsigned long nrpages;
+ int ret;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
@@ -3483,5 +3519,7 @@ void __init buffer_init(void)
*/
nrpages = (nr_free_buffer_pages() * 10) / 100;
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
- hotcpu_notifier(buffer_cpu_notify, 0);
+ ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
+ NULL, buffer_exit_cpu_dead);
+ WARN_ON(ret < 0);
}
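The call sites converted above use clean_bdev_bh_alias(), a single-buffer convenience wrapper around the new ranged helper; in the tree it is a static inline in include/linux/buffer_head.h, roughly:

	static inline void clean_bdev_bh_alias(struct buffer_head *bh)
	{
		clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
	}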
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c23eb0e9348c..d7a93696663b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1236,26 +1236,30 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
op = ceph_snap(dir) == CEPH_SNAPDIR ?
- CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
if (!IS_ERR(req)) {
req->r_dentry = dget(dentry);
- req->r_num_caps = 2;
+ req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2;
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = mask;
- req->r_locked_dir = dir;
err = ceph_mdsc_do_request(mdsc, NULL, req);
- if (err == 0 || err == -ENOENT) {
- if (dentry == req->r_dentry) {
- valid = !d_unhashed(dentry);
- } else {
- d_invalidate(req->r_dentry);
- err = -EAGAIN;
- }
+ switch (err) {
+ case 0:
+ if (d_really_is_positive(dentry) &&
+ d_inode(dentry) == req->r_target_inode)
+ valid = 1;
+ break;
+ case -ENOENT:
+ if (d_really_is_negative(dentry))
+ valid = 1;
+ /* Fallthrough */
+ default:
+ break;
}
ceph_mdsc_put_request(req);
dout("d_revalidate %p lookup result=%d\n",
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 8347c90cf483..5eb04129f938 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -808,7 +808,11 @@ calc_seckey(struct cifs_ses *ses)
struct crypto_skcipher *tfm_arc4;
struct scatterlist sgin, sgout;
struct skcipher_request *req;
- unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
+ unsigned char *sec_key;
+
+ sec_key = kmalloc(CIFS_SESS_KEY_SIZE, GFP_KERNEL);
+ if (sec_key == NULL)
+ return -ENOMEM;
get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
@@ -816,7 +820,7 @@ calc_seckey(struct cifs_ses *ses)
if (IS_ERR(tfm_arc4)) {
rc = PTR_ERR(tfm_arc4);
cifs_dbg(VFS, "could not allocate crypto API arc4\n");
- return rc;
+ goto out;
}
rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response,
@@ -854,7 +858,8 @@ calc_seckey(struct cifs_ses *ses)
out_free_cipher:
crypto_free_skcipher(tfm_arc4);
-
+out:
+ kfree(sec_key);
return rc;
}
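Moving sec_key off the stack is the usual prerequisite for handing a buffer to the crypto API: scatterlists need addresses that virt_to_page() can resolve, and with CONFIG_VMAP_STACK the stack is vmalloc-backed, so an on-stack nonce cannot be wrapped in a scatterlist. A sketch of the constraint (the motivation is inferred; the hunk itself only shows the allocation change):

	unsigned char *sec_key = kmalloc(CIFS_SESS_KEY_SIZE, GFP_KERNEL);
	struct scatterlist sg;

	sg_init_one(&sg, sec_key, CIFS_SESS_KEY_SIZE);	/* ok: kmalloc'ed, lowmem */
	/* an on-stack buffer here would break under CONFIG_VMAP_STACK */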
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3f3185febc58..e3fed9249a04 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3427,6 +3427,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
__u16 rc = 0;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
struct posix_acl_xattr_header *local_acl = (void *)pACL;
+ struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
int count;
int i;
@@ -3453,8 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
return 0;
}
for (i = 0; i < count; i++) {
- rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i],
- (struct posix_acl_xattr_entry *)(local_acl + 1));
+ rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
if (rc != 0) {
/* ACE not converted */
break;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index aab5227979e2..f7563c88c917 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -41,6 +41,7 @@
#include <keys/user-type.h>
#include <net/ipv6.h>
#include <linux/parser.h>
+#include <linux/bvec.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -412,6 +413,9 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
} while (server->tcpStatus == CifsNeedReconnect);
+ if (server->tcpStatus == CifsNeedNegotiate)
+ mod_delayed_work(cifsiod_wq, &server->echo, 0);
+
return rc;
}
@@ -421,17 +425,25 @@ cifs_echo_request(struct work_struct *work)
int rc;
struct TCP_Server_Info *server = container_of(work,
struct TCP_Server_Info, echo.work);
- unsigned long echo_interval = server->echo_interval;
+ unsigned long echo_interval;
+
+ /*
+ * If we need to renegotiate, set echo interval to zero to
+ * immediately call echo service where we can renegotiate.
+ */
+ if (server->tcpStatus == CifsNeedNegotiate)
+ echo_interval = 0;
+ else
+ echo_interval = server->echo_interval;
/*
- * We cannot send an echo if it is disabled or until the
- * NEGOTIATE_PROTOCOL request is done, which is indicated by
- * server->ops->need_neg() == true. Also, no need to ping if
- * we got a response recently.
+ * We cannot send an echo if it is disabled.
+ * Also, no need to ping if we got a response recently.
*/
if (server->tcpStatus == CifsNeedReconnect ||
- server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew ||
+ server->tcpStatus == CifsExiting ||
+ server->tcpStatus == CifsNew ||
(server->ops->can_echo && !server->ops->can_echo(server)) ||
time_before(jiffies, server->lstrp + echo_interval - HZ))
goto requeue_echo;
@@ -442,7 +454,7 @@ cifs_echo_request(struct work_struct *work)
server->hostname);
requeue_echo:
- queue_delayed_work(cifsiod_wq, &server->echo, echo_interval);
+ queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval);
}
static bool
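The reconnect path now kicks the echo worker immediately (delay 0) once the socket is re-established but the protocol still needs negotiation, letting the periodic echo job double as the driver for renegotiation; the worker then re-arms itself at the normal cadence. The pattern in isolation (names from the hunks above):

	/* after reconnect: run the echo worker right away */
	if (server->tcpStatus == CifsNeedNegotiate)
		mod_delayed_work(cifsiod_wq, &server->echo, 0);

	/* in the worker: always re-queue with the configured interval */
	queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval);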
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 206a597b2293..5f02edc819af 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -28,6 +28,7 @@
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/tcp.h>
+#include <linux/bvec.h>
#include <linux/highmem.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 92348faf9865..f514978f6688 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -8,9 +8,7 @@ config FS_ENCRYPTION
select CRYPTO_XTS
select CRYPTO_CTS
select CRYPTO_CTR
- select CRYPTO_SHA256
select KEYS
- select ENCRYPTED_KEYS
help
Enable encryption of files and directories. This
feature is similar to ecryptfs, but it is more memory
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 98f87fe8f186..ac8e4f6a3773 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -27,7 +27,7 @@
#include <linux/bio.h>
#include <linux/dcache.h>
#include <linux/namei.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
static unsigned int num_prealloc_crypto_ctxs = 128;
@@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
{
unsigned long flags;
- if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+ if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) {
mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
ctx->w.bounce_page = NULL;
}
@@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx);
* Return: An allocated and initialized encryption context on success; error
* value or NULL otherwise.
*/
-struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
+struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
{
struct fscrypt_ctx *ctx = NULL;
struct fscrypt_info *ci = inode->i_crypt_info;
@@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
} else {
ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
}
- ctx->flags &= ~FS_WRITE_PATH_FL;
+ ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL;
return ctx;
}
EXPORT_SYMBOL(fscrypt_get_ctx);
@@ -146,9 +146,10 @@ typedef enum {
FS_ENCRYPT,
} fscrypt_direction_t;
-static int do_page_crypto(struct inode *inode,
- fscrypt_direction_t rw, pgoff_t index,
+static int do_page_crypto(const struct inode *inode,
+ fscrypt_direction_t rw, u64 lblk_num,
struct page *src_page, struct page *dest_page,
+ unsigned int len, unsigned int offs,
gfp_t gfp_flags)
{
struct {
@@ -162,6 +163,8 @@ static int do_page_crypto(struct inode *inode,
struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
+ BUG_ON(len == 0);
+
req = skcipher_request_alloc(tfm, gfp_flags);
if (!req) {
printk_ratelimited(KERN_ERR
@@ -175,14 +178,14 @@ static int do_page_crypto(struct inode *inode,
page_crypt_complete, &ecr);
BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE);
- xts_tweak.index = cpu_to_le64(index);
+ xts_tweak.index = cpu_to_le64(lblk_num);
memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding));
sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
+ sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_SIZE, 0);
- skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak);
+ sg_set_page(&src, src_page, len, offs);
+ skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak);
if (rw == FS_DECRYPT)
res = crypto_skcipher_decrypt(req);
else
@@ -207,34 +210,66 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
if (ctx->w.bounce_page == NULL)
return ERR_PTR(-ENOMEM);
- ctx->flags |= FS_WRITE_PATH_FL;
+ ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL;
return ctx->w.bounce_page;
}
/**
* fscrypt_encrypt_page() - Encrypts a page
- * @inode: The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- * @gfp_flags: The gfp flag for memory allocation
+ * @inode: The inode for which the encryption should take place
+ * @page: The page to encrypt. Must be locked for bounce-page
+ * encryption.
+ * @len: Length of data to encrypt in @page and encrypted
+ * data in returned page.
+ * @offs: Offset of data within @page and returned
+ * page holding encrypted data.
+ * @lblk_num: Logical block number. This must be unique for multiple
+ * calls with same inode, except when overwriting
+ * previously written data.
+ * @gfp_flags: The gfp flag for memory allocation
*
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
+ * Encrypts @page using the ctx encryption context. Performs encryption
+ * either in-place or into a newly allocated bounce page.
+ * Called on the page write path.
*
- * Called on the page write path. The caller must call
+ * Bounce page allocation is the default.
+ * In this case, the contents of @page are encrypted and stored in an
+ * allocated bounce page. @page has to be locked and the caller must call
* fscrypt_restore_control_page() on the returned ciphertext page to
* release the bounce buffer and the encryption context.
*
- * Return: An allocated page with the encrypted content on success. Else, an
+ * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in
+ * fscrypt_operations. Here, the input-page is returned with its content
+ * encrypted.
+ *
+ * Return: A page with the encrypted content on success. Else, an
* error value or NULL.
*/
-struct page *fscrypt_encrypt_page(struct inode *inode,
- struct page *plaintext_page, gfp_t gfp_flags)
+struct page *fscrypt_encrypt_page(const struct inode *inode,
+ struct page *page,
+ unsigned int len,
+ unsigned int offs,
+ u64 lblk_num, gfp_t gfp_flags)
+
{
struct fscrypt_ctx *ctx;
- struct page *ciphertext_page = NULL;
+ struct page *ciphertext_page = page;
int err;
- BUG_ON(!PageLocked(plaintext_page));
+ BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0);
+
+ if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
+ /* with inplace-encryption we just encrypt the page */
+ err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+ page, ciphertext_page,
+ len, offs, gfp_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ return ciphertext_page;
+ }
+
+ BUG_ON(!PageLocked(page));
ctx = fscrypt_get_ctx(inode, gfp_flags);
if (IS_ERR(ctx))
@@ -245,10 +280,10 @@ struct page *fscrypt_encrypt_page(struct inode *inode,
if (IS_ERR(ciphertext_page))
goto errout;
- ctx->w.control_page = plaintext_page;
- err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page,
- gfp_flags);
+ ctx->w.control_page = page;
+ err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+ page, ciphertext_page,
+ len, offs, gfp_flags);
if (err) {
ciphertext_page = ERR_PTR(err);
goto errout;
@@ -265,8 +300,13 @@ errout:
EXPORT_SYMBOL(fscrypt_encrypt_page);
/**
- * f2crypt_decrypt_page() - Decrypts a page in-place
- * @page: The page to decrypt. Must be locked.
+ * fscrypt_decrypt_page() - Decrypts a page in-place
+ * @inode: The corresponding inode for the page to decrypt.
+ * @page: The page to decrypt. Must be locked in case
+ * it is a writeback page (FS_CFLG_OWN_PAGES unset).
+ * @len: Number of bytes in @page to be decrypted.
+ * @offs: Start of data in @page.
+ * @lblk_num: Logical block number.
*
* Decrypts page in-place using the ctx encryption context.
*
@@ -274,16 +314,18 @@ EXPORT_SYMBOL(fscrypt_encrypt_page);
*
* Return: Zero on success, non-zero otherwise.
*/
-int fscrypt_decrypt_page(struct page *page)
+int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
+ unsigned int len, unsigned int offs, u64 lblk_num)
{
- BUG_ON(!PageLocked(page));
+ if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
+ BUG_ON(!PageLocked(page));
- return do_page_crypto(page->mapping->host,
- FS_DECRYPT, page->index, page, page, GFP_NOFS);
+ return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len,
+ offs, GFP_NOFS);
}
EXPORT_SYMBOL(fscrypt_decrypt_page);
-int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
+int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
struct fscrypt_ctx *ctx;
@@ -306,7 +348,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
while (len--) {
err = do_page_crypto(inode, FS_ENCRYPT, lblk,
ZERO_PAGE(0), ciphertext_page,
- GFP_NOFS);
+ PAGE_SIZE, 0, GFP_NOFS);
if (err)
goto errout;
@@ -414,7 +456,8 @@ static void completion_pages(struct work_struct *work)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- int ret = fscrypt_decrypt_page(page);
+ int ret = fscrypt_decrypt_page(page->mapping->host, page,
+ PAGE_SIZE, 0, page->index);
if (ret) {
WARN_ON_ONCE(1);
@@ -482,17 +525,22 @@ static void fscrypt_destroy(void)
/**
* fscrypt_initialize() - allocate major buffers for fs encryption.
+ * @cop_flags: fscrypt operations flags
*
* We only call this when we start accessing encrypted files, since it
* results in memory getting allocated that wouldn't otherwise be used.
*
* Return: Zero on success, non-zero otherwise.
*/
-int fscrypt_initialize(void)
+int fscrypt_initialize(unsigned int cop_flags)
{
int i, res = -ENOMEM;
- if (fscrypt_bounce_page_pool)
+ /*
+ * No need to allocate a bounce page pool if there already is one or
+ * this FS won't use it.
+ */
+ if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool)
return 0;
mutex_lock(&fscrypt_init_mutex);
@@ -521,7 +569,6 @@ fail:
mutex_unlock(&fscrypt_init_mutex);
return res;
}
-EXPORT_SYMBOL(fscrypt_initialize);
/**
* fscrypt_init() - Set up for fs encryption.
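fscrypt_encrypt_page() now takes an explicit length, offset, and logical block number, so filesystems with a block size smaller than PAGE_SIZE, or with in-place encryption via FS_CFLG_OWN_PAGES, can encrypt sub-page units. A hedged sketch of a write-path caller using only names from this diff (blocksize, offs, and lblk stand in for the caller's values):

	struct page *cipher;

	cipher = fscrypt_encrypt_page(inode, page, blocksize, offs, lblk,
				      GFP_NOFS);
	if (IS_ERR(cipher))
		return PTR_ERR(cipher);
	/* submit 'cipher' for writeback; then, in bounce-page mode only: */
	fscrypt_restore_control_page(cipher);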
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 9b774f4b50c8..56ad9d195f18 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -12,7 +12,7 @@
#include <linux/scatterlist.h>
#include <linux/ratelimit.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
/**
* fname_crypt_complete() - completion callback for filename crypto
@@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst)
return cp - dst;
}
-u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
+u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen)
{
int padding = 32;
struct fscrypt_info *ci = inode->i_crypt_info;
@@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
* Allocates an output buffer that is sufficient for the crypto operation
* specified by the context and the direction.
*/
-int fscrypt_fname_alloc_buffer(struct inode *inode,
+int fscrypt_fname_alloc_buffer(const struct inode *inode,
u32 ilen, struct fscrypt_str *crypto_str)
{
unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
@@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
fname->disk_name.len = iname->len;
return 0;
}
- ret = get_crypt_info(dir);
+ ret = fscrypt_get_crypt_info(dir);
if (ret && ret != -EOPNOTSUPP)
return ret;
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
new file mode 100644
index 000000000000..aeab032d7d35
--- /dev/null
+++ b/fs/crypto/fscrypt_private.h
@@ -0,0 +1,93 @@
+/*
+ * fscrypt_private.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#ifndef _FSCRYPT_PRIVATE_H
+#define _FSCRYPT_PRIVATE_H
+
+#include <linux/fscrypto.h>
+
+#define FS_FNAME_CRYPTO_DIGEST_SIZE 32
+
+/* Encryption parameters */
+#define FS_XTS_TWEAK_SIZE 16
+#define FS_AES_128_ECB_KEY_SIZE 16
+#define FS_AES_256_GCM_KEY_SIZE 32
+#define FS_AES_256_CBC_KEY_SIZE 32
+#define FS_AES_256_CTS_KEY_SIZE 32
+#define FS_AES_256_XTS_KEY_SIZE 64
+#define FS_MAX_KEY_SIZE 64
+
+#define FS_KEY_DESC_PREFIX "fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE 8
+
+#define FS_KEY_DERIVATION_NONCE_SIZE 16
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ * 1 byte: Protector format (1 = this version)
+ * 1 byte: File contents encryption mode
+ * 1 byte: File names encryption mode
+ * 1 byte: Flags
+ * 8 bytes: Master Key descriptor
+ * 16 bytes: Encryption Key derivation nonce
+ */
+struct fscrypt_context {
+ u8 format;
+ u8 contents_encryption_mode;
+ u8 filenames_encryption_mode;
+ u8 flags;
+ u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+ u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+} __packed;
+
+#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
+
+/* This is passed in from userspace into the kernel keyring */
+struct fscrypt_key {
+ u32 mode;
+ u8 raw[FS_MAX_KEY_SIZE];
+ u32 size;
+} __packed;
+
+/*
+ * A pointer to this structure is stored in the file system's in-core
+ * representation of an inode.
+ */
+struct fscrypt_info {
+ u8 ci_data_mode;
+ u8 ci_filename_mode;
+ u8 ci_flags;
+ struct crypto_skcipher *ci_ctfm;
+ struct key *ci_keyring_key;
+ u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
+#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002
+
+struct fscrypt_completion_result {
+ struct completion completion;
+ int res;
+};
+
+#define DECLARE_FS_COMPLETION_RESULT(ecr) \
+ struct fscrypt_completion_result ecr = { \
+ COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+
+/* crypto.c */
+int fscrypt_initialize(unsigned int cop_flags);
+
+/* keyinfo.c */
+extern int fscrypt_get_crypt_info(struct inode *);
+
+#endif /* _FSCRYPT_PRIVATE_H */
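Because struct fscrypt_context is __packed, its on-disk size follows from the layout documented above: 1 + 1 + 1 + 1 + 8 + 16 = 28 bytes. A compile-time check a consumer might add (illustrative, not part of the diff):

	BUILD_BUG_ON(sizeof(struct fscrypt_context) != 28);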
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 67fb6d8876d0..6eeea1dcba41 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -10,7 +10,7 @@
#include <keys/user-type.h>
#include <linux/scatterlist.h>
-#include <linux/fscrypto.h>
+#include "fscrypt_private.h"
static void derive_crypt_complete(struct crypto_async_request *req, int rc)
{
@@ -178,7 +178,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
kmem_cache_free(fscrypt_info_cachep, ci);
}
-int get_crypt_info(struct inode *inode)
+int fscrypt_get_crypt_info(struct inode *inode)
{
struct fscrypt_info *crypt_info;
struct fscrypt_context ctx;
@@ -188,7 +188,7 @@ int get_crypt_info(struct inode *inode)
u8 *raw_key = NULL;
int res;
- res = fscrypt_initialize();
+ res = fscrypt_initialize(inode->i_sb->s_cop->flags);
if (res)
return res;
@@ -327,7 +327,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
(ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
(1 << KEY_FLAG_REVOKED) |
(1 << KEY_FLAG_DEAD)))))
- return get_crypt_info(inode);
+ return fscrypt_get_crypt_info(inode);
return 0;
}
EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6865663aac69..6ed7c2eebeec 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,8 +10,8 @@
#include <linux/random.h>
#include <linux/string.h>
-#include <linux/fscrypto.h>
#include <linux/mount.h>
+#include "fscrypt_private.h"
static int inode_has_encryption_context(struct inode *inode)
{
@@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode,
return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
}
-int fscrypt_process_policy(struct file *filp,
- const struct fscrypt_policy *policy)
+int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
{
+ struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
int ret;
+ if (copy_from_user(&policy, arg, sizeof(policy)))
+ return -EFAULT;
+
if (!inode_owner_or_capable(inode))
return -EACCES;
- if (policy->version != 0)
+ if (policy.version != 0)
return -EINVAL;
ret = mnt_want_write_file(filp);
@@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp,
ret = -ENOTEMPTY;
else
ret = create_encryption_context_from_policy(inode,
- policy);
+ &policy);
} else if (!is_encryption_context_consistent_with_policy(inode,
- policy)) {
+ &policy)) {
printk(KERN_WARNING
"%s: Policy inconsistent with encryption context\n",
__func__);
@@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp,
mnt_drop_write_file(filp);
return ret;
}
-EXPORT_SYMBOL(fscrypt_process_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_set_policy);
-int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
+ struct inode *inode = file_inode(filp);
struct fscrypt_context ctx;
+ struct fscrypt_policy policy;
int res;
if (!inode->i_sb->s_cop->get_context ||
@@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
return -EINVAL;
- policy->version = 0;
- policy->contents_encryption_mode = ctx.contents_encryption_mode;
- policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
- policy->flags = ctx.flags;
- memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ policy.version = 0;
+ policy.contents_encryption_mode = ctx.contents_encryption_mode;
+ policy.filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy.flags = ctx.flags;
+ memcpy(policy.master_key_descriptor, ctx.master_key_descriptor,
FS_KEY_DESCRIPTOR_SIZE);
+
+ if (copy_to_user(arg, &policy, sizeof(policy)))
+ return -EFAULT;
return 0;
}
-EXPORT_SYMBOL(fscrypt_get_policy);
+EXPORT_SYMBOL(fscrypt_ioctl_get_policy);
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
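With copy_from_user()/copy_to_user() handled inside fscrypt_ioctl_set_policy() and fscrypt_ioctl_get_policy(), filesystems can pass the user pointer straight through from their ioctl handlers. The userspace side is unchanged; a sketch with illustrative mode values (constants from the uapi headers of this era):

	struct fscrypt_policy policy = {
		.version = 0,
		.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS,
		.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS,
		.flags = 0,
		/* .master_key_descriptor = the 8-byte key handle */
	};

	if (ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &policy) != 0)
		perror("FS_IOC_SET_ENCRYPTION_POLICY");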
diff --git a/fs/dax.c b/fs/dax.c
index 014defd2e744..a8732fbed381 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,28 +31,15 @@
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
+#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"
-/*
- * We use lowest available bit in exceptional entry for locking, other two
- * bits to determine entry type. In total 3 special bits.
- */
-#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
-#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
-#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
-#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
-#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
-#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
-#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
- RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
- RADIX_TREE_EXCEPTIONAL_ENTRY))
-
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
-wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
{
@@ -64,14 +51,6 @@ static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
-static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
- pgoff_t index)
-{
- unsigned long hash = hash_long((unsigned long)mapping ^ index,
- DAX_WAIT_TABLE_BITS);
- return wait_table + hash;
-}
-
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
struct request_queue *q = bdev->bd_queue;
@@ -98,209 +77,52 @@ static void dax_unmap_atomic(struct block_device *bdev,
blk_queue_exit(bdev->bd_queue);
}
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+static int dax_is_pmd_entry(void *entry)
{
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
- };
- long rc;
-
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- rc = dax_map_atomic(bdev, &dax);
- if (rc < 0)
- return ERR_PTR(rc);
- memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
- dax_unmap_atomic(bdev, &dax);
- return page;
+ return (unsigned long)entry & RADIX_DAX_PMD;
}
-static bool buffer_written(struct buffer_head *bh)
+static int dax_is_pte_entry(void *entry)
{
- return buffer_mapped(bh) && !buffer_unwritten(bh);
+ return !((unsigned long)entry & RADIX_DAX_PMD);
}
-/*
- * When ext4 encounters a hole, it returns without modifying the buffer_head
- * which means that we can't trust b_size. To cope with this, we set b_state
- * to 0 before calling get_block and, if any bit is set, we know we can trust
- * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
- * and would save us time calling get_block repeatedly.
- */
-static bool buffer_size_valid(struct buffer_head *bh)
+static int dax_is_zero_entry(void *entry)
{
- return bh->b_state != 0;
+ return (unsigned long)entry & RADIX_DAX_HZP;
}
-
-static sector_t to_sector(const struct buffer_head *bh,
- const struct inode *inode)
+static int dax_is_empty_entry(void *entry)
{
- sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
- return sector;
+ return (unsigned long)entry & RADIX_DAX_EMPTY;
}
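The RADIX_DAX_* macros deleted above are replaced by these small predicates over a richer entry encoding: besides the lock bit, an entry now records whether it is PMD-sized, a huge zero page (HZP), or an empty placeholder, with the sector packed into the high bits. The real bit definitions move to include/linux/dax.h and are not shown in this hunk; here is a runnable sketch of the scheme with invented bit positions.

#include <stdio.h>

/* Illustrative layout: lock, PMD, huge-zero-page, empty, then the sector. */
#define DAX_ENTRY_LOCK  (1UL << 0)
#define DAX_PMD         (1UL << 1)
#define DAX_HZP         (1UL << 2)
#define DAX_EMPTY       (1UL << 3)
#define DAX_SHIFT       4

static unsigned long dax_locked_entry(unsigned long sector, unsigned long flags)
{
    return (sector << DAX_SHIFT) | flags | DAX_ENTRY_LOCK;
}

static unsigned long dax_entry_sector(unsigned long entry)
{
    return entry >> DAX_SHIFT;
}

static int is_pmd_entry(unsigned long entry)
{
    return !!(entry & DAX_PMD);
}

int main(void)
{
    unsigned long e = dax_locked_entry(4096, DAX_PMD | DAX_EMPTY);

    printf("sector=%lu pmd=%d\n", dax_entry_sector(e), is_pmd_entry(e));
    return 0;
}

Packing the flags and the sector into one unsigned long is what lets a single radix tree slot describe a locked, PMD-sized, possibly placeholder mapping without any extra allocation.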
-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
- loff_t start, loff_t end, get_block_t get_block,
- struct buffer_head *bh)
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
- loff_t pos = start, max = start, bh_max = start;
- bool hole = false;
- struct block_device *bdev = NULL;
- int rw = iov_iter_rw(iter), rc;
- long map_len = 0;
+ struct page *page = alloc_pages(GFP_KERNEL, 0);
struct blk_dax_ctl dax = {
- .addr = ERR_PTR(-EIO),
+ .size = PAGE_SIZE,
+ .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
};
- unsigned blkbits = inode->i_blkbits;
- sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
- >> blkbits;
-
- if (rw == READ)
- end = min(end, i_size_read(inode));
-
- while (pos < end) {
- size_t len;
- if (pos == max) {
- long page = pos >> PAGE_SHIFT;
- sector_t block = page << (PAGE_SHIFT - blkbits);
- unsigned first = pos - (block << blkbits);
- long size;
-
- if (pos == bh_max) {
- bh->b_size = PAGE_ALIGN(end - pos);
- bh->b_state = 0;
- rc = get_block(inode, block, bh, rw == WRITE);
- if (rc)
- break;
- if (!buffer_size_valid(bh))
- bh->b_size = 1 << blkbits;
- bh_max = pos - first + bh->b_size;
- bdev = bh->b_bdev;
- /*
- * We allow uninitialized buffers for writes
- * beyond EOF as those cannot race with faults
- */
- WARN_ON_ONCE(
- (buffer_new(bh) && block < file_blks) ||
- (rw == WRITE && buffer_unwritten(bh)));
- } else {
- unsigned done = bh->b_size -
- (bh_max - (pos - first));
- bh->b_blocknr += done >> blkbits;
- bh->b_size -= done;
- }
-
- hole = rw == READ && !buffer_written(bh);
- if (hole) {
- size = bh->b_size - first;
- } else {
- dax_unmap_atomic(bdev, &dax);
- dax.sector = to_sector(bh, inode);
- dax.size = bh->b_size;
- map_len = dax_map_atomic(bdev, &dax);
- if (map_len < 0) {
- rc = map_len;
- break;
- }
- dax.addr += first;
- size = map_len - first;
- }
- /*
- * pos + size is one past the last offset for IO,
- * so pos + size can overflow loff_t at extreme offsets.
- * Cast to u64 to catch this and get the true minimum.
- */
- max = min_t(u64, pos + size, end);
- }
-
- if (iov_iter_rw(iter) == WRITE) {
- len = copy_from_iter_pmem(dax.addr, max - pos, iter);
- } else if (!hole)
- len = copy_to_iter((void __force *) dax.addr, max - pos,
- iter);
- else
- len = iov_iter_zero(max - pos, iter);
-
- if (!len) {
- rc = -EFAULT;
- break;
- }
+ long rc;
- pos += len;
- if (!IS_ERR(dax.addr))
- dax.addr += len;
- }
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+ rc = dax_map_atomic(bdev, &dax);
+ if (rc < 0)
+ return ERR_PTR(rc);
+ memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
dax_unmap_atomic(bdev, &dax);
-
- return (pos == start) ? rc : pos - start;
-}
-
-/**
- * dax_do_io - Perform I/O to a DAX file
- * @iocb: The control block for this I/O
- * @inode: The file which the I/O is directed at
- * @iter: The addresses to do I/O from or to
- * @get_block: The filesystem method used to translate file offsets to blocks
- * @end_io: A filesystem callback for I/O completion
- * @flags: See below
- *
- * This function uses the same locking scheme as do_blockdev_direct_IO:
- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
- * caller for writes. For reads, we take and release the i_mutex ourselves.
- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
- struct iov_iter *iter, get_block_t get_block,
- dio_iodone_t end_io, int flags)
-{
- struct buffer_head bh;
- ssize_t retval = -EINVAL;
- loff_t pos = iocb->ki_pos;
- loff_t end = pos + iov_iter_count(iter);
-
- memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
-
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
- inode_lock(inode);
-
- /* Protects against truncate */
- if (!(flags & DIO_SKIP_DIO_COUNT))
- inode_dio_begin(inode);
-
- retval = dax_io(inode, iter, pos, end, get_block, &bh);
-
- if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
- inode_unlock(inode);
-
- if (end_io) {
- int err;
-
- err = end_io(iocb, pos, retval, bh.b_private);
- if (err)
- retval = err;
- }
-
- if (!(flags & DIO_SKIP_DIO_COUNT))
- inode_dio_end(inode);
- return retval;
+ return page;
}
-EXPORT_SYMBOL_GPL(dax_do_io);
/*
* DAX radix tree locking
*/
struct exceptional_entry_key {
struct address_space *mapping;
- unsigned long index;
+ pgoff_t entry_start;
};
struct wait_exceptional_entry_queue {
@@ -308,6 +130,26 @@ struct wait_exceptional_entry_queue {
struct exceptional_entry_key key;
};
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+ pgoff_t index, void *entry, struct exceptional_entry_key *key)
+{
+ unsigned long hash;
+
+ /*
+ * If 'entry' is a PMD, align the 'index' that we use for the wait
+ * queue to the start of that PMD. This ensures that all offsets in
+ * the range covered by the PMD map to the same bit lock.
+ */
+ if (dax_is_pmd_entry(entry))
+ index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+
+ key->mapping = mapping;
+ key->entry_start = index;
+
+ hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
+ return wait_table + hash;
+}
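The key change in dax_entry_waitqueue() is the index normalization: when the entry is PMD-sized, the index is rounded down to the first page of the 2MiB range before hashing, so a waiter at any 4k offset under that PMD lands in the same bucket as the lock holder. A self-contained model of that computation follows; the multiplier merely approximates the kernel's hash_long().

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PMD_SHIFT       21
#define WAIT_TABLE_BITS 12

/* Crude stand-in for hash_long(). */
static unsigned long hash_model(unsigned long long v, unsigned int bits)
{
    return (unsigned long)((v * 0x61C8864680B583EBULL) >> (64 - bits));
}

static unsigned long wait_bucket(void *mapping, unsigned long index, int pmd)
{
    if (pmd)    /* align to the start of the 2MiB range */
        index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
    return hash_model((uintptr_t)mapping ^ index, WAIT_TABLE_BITS);
}

int main(void)
{
    int m;      /* fake address_space, only its address matters */

    /* Index 513 falls inside the PMD starting at 512: same bucket. */
    printf("%lu == %lu\n",
           wait_bucket(&m, 513, 1), wait_bucket(&m, 512, 1));
    return 0;
}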
+
static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
int sync, void *keyp)
{
@@ -316,7 +158,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
container_of(wait, struct wait_exceptional_entry_queue, wait);
if (key->mapping != ewait->key.mapping ||
- key->index != ewait->key.index)
+ key->entry_start != ewait->key.entry_start)
return 0;
return autoremove_wake_function(wait, mode, sync, NULL);
}
@@ -342,7 +184,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
@@ -356,7 +198,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
@@ -372,24 +214,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
static void *get_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void ***slotp)
{
- void *ret, **slot;
+ void *entry, **slot;
struct wait_exceptional_entry_queue ewait;
- wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+ wait_queue_head_t *wq;
init_wait(&ewait.wait);
ewait.wait.func = wake_exceptional_entry_func;
- ewait.key.mapping = mapping;
- ewait.key.index = index;
for (;;) {
- ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+ entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
&slot);
- if (!ret || !radix_tree_exceptional_entry(ret) ||
+ if (!entry || !radix_tree_exceptional_entry(entry) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
- return ret;
+ return entry;
}
+
+ wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&mapping->tree_lock);
@@ -399,52 +241,173 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
}
}
+static void dax_unlock_mapping_entry(struct address_space *mapping,
+ pgoff_t index)
+{
+ void *entry, **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+ if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
+ !slot_locked(mapping, slot))) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return;
+ }
+ unlock_slot(mapping, slot);
+ spin_unlock_irq(&mapping->tree_lock);
+ dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry)) {
+ unlock_page(entry);
+ put_page(entry);
+ } else {
+ dax_unlock_mapping_entry(mapping, index);
+ }
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
+{
+ if (!radix_tree_exceptional_entry(entry))
+ return;
+
+ /* We have to wake up next waiter for the radix tree entry lock */
+ dax_wake_mapping_entry_waiter(mapping, index, entry, false);
+}
+
/*
* Find radix tree entry at given index. If it points to a page, return with
* the page locked. If it points to the exceptional entry, return with the
* radix tree entry locked. If the radix tree doesn't contain given index,
* create empty exceptional entry for the index and return with it locked.
*
+ * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
+ * either return that locked entry or will return an error. This error will
+ * happen if there are any 4k entries (either zero pages or DAX entries)
+ * within the 2MiB range that we are requesting.
+ *
+ * We always favor 4k entries over 2MiB entries. There isn't a flow where we
+ * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
+ * insertion will fail if it finds any 4k entries already in the tree, and a
+ * 4k insertion will cause an existing 2MiB entry to be unmapped and
+ * downgraded to 4k entries. This happens for both 2MiB huge zero pages as
+ * well as 2MiB empty entries.
+ *
+ * The exception to this downgrade path is for 2MiB DAX PMD entries that have
+ * real storage backing them. We will leave these real 2MiB DAX entries in
+ * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ *
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
*/
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
+ unsigned long size_flag)
{
- void *ret, **slot;
+ bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
+ void *entry, **slot;
restart:
spin_lock_irq(&mapping->tree_lock);
- ret = get_unlocked_mapping_entry(mapping, index, &slot);
+ entry = get_unlocked_mapping_entry(mapping, index, &slot);
+
+ if (entry) {
+ if (size_flag & RADIX_DAX_PMD) {
+ if (!radix_tree_exceptional_entry(entry) ||
+ dax_is_pte_entry(entry)) {
+ put_unlocked_mapping_entry(mapping, index,
+ entry);
+ entry = ERR_PTR(-EEXIST);
+ goto out_unlock;
+ }
+ } else { /* trying to grab a PTE entry */
+ if (radix_tree_exceptional_entry(entry) &&
+ dax_is_pmd_entry(entry) &&
+ (dax_is_zero_entry(entry) ||
+ dax_is_empty_entry(entry))) {
+ pmd_downgrade = true;
+ }
+ }
+ }
+
/* No entry for given index? Make sure radix tree is big enough. */
- if (!ret) {
+ if (!entry || pmd_downgrade) {
int err;
+ if (pmd_downgrade) {
+ /*
+ * Make sure 'entry' remains valid while we drop
+ * mapping->tree_lock.
+ */
+ entry = lock_slot(mapping, slot);
+ }
+
spin_unlock_irq(&mapping->tree_lock);
+ /*
+ * Besides huge zero pages the only other thing that gets
+ * downgraded are empty entries which don't need to be
+ * unmapped.
+ */
+ if (pmd_downgrade && dax_is_zero_entry(entry))
+ unmap_mapping_range(mapping,
+ (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+
err = radix_tree_preload(
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
- if (err)
+ if (err) {
+ if (pmd_downgrade)
+ put_locked_mapping_entry(mapping, index, entry);
return ERR_PTR(err);
- ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
- RADIX_DAX_ENTRY_LOCK);
+ }
spin_lock_irq(&mapping->tree_lock);
- err = radix_tree_insert(&mapping->page_tree, index, ret);
+
+ if (pmd_downgrade) {
+ radix_tree_delete(&mapping->page_tree, index);
+ mapping->nrexceptional--;
+ dax_wake_mapping_entry_waiter(mapping, index, entry,
+ true);
+ }
+
+ entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);
+
+ err = __radix_tree_insert(&mapping->page_tree, index,
+ dax_radix_order(entry), entry);
radix_tree_preload_end();
if (err) {
spin_unlock_irq(&mapping->tree_lock);
- /* Someone already created the entry? */
- if (err == -EEXIST)
+ /*
+ * Someone already created the entry? This is a
+ * normal failure when inserting PMDs in a range
+ * that already contains PTEs. In that case we want
+ * to return -EEXIST immediately.
+ */
+ if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
goto restart;
+ /*
+ * Our insertion of a DAX PMD entry failed, most
+ * likely because it collided with a PTE sized entry
+ * at a different index in the PMD range. We haven't
+ * inserted anything into the radix tree and have no
+ * waiters to wake.
+ */
return ERR_PTR(err);
}
/* Good, we have inserted empty locked entry into the tree. */
mapping->nrexceptional++;
spin_unlock_irq(&mapping->tree_lock);
- return ret;
+ return entry;
}
/* Normal page in radix tree? */
- if (!radix_tree_exceptional_entry(ret)) {
- struct page *page = ret;
+ if (!radix_tree_exceptional_entry(entry)) {
+ struct page *page = entry;
get_page(page);
spin_unlock_irq(&mapping->tree_lock);
@@ -457,15 +420,26 @@ restart:
}
return page;
}
- ret = lock_slot(mapping, slot);
+ entry = lock_slot(mapping, slot);
+ out_unlock:
spin_unlock_irq(&mapping->tree_lock);
- return ret;
+ return entry;
}
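The long comment above grab_mapping_entry() reduces to a three-way decision: reuse what is in the tree, fail a PMD grab with -EEXIST when a 4k entry is in the way, or downgrade a zero/empty PMD when a PTE is wanted. Here is a runnable distillation of just that decision table, with flag values invented for the demo.

#include <stdio.h>

#define ENTRY_PMD   (1 << 0)
#define ENTRY_HZP   (1 << 1)    /* huge zero page */
#define ENTRY_EMPTY (1 << 2)

enum action { USE_EXISTING, FAIL_EEXIST, DOWNGRADE };

static enum action grab_decision(int have_entry, int flags, int want_pmd)
{
    if (!have_entry)
        return USE_EXISTING;        /* insert a fresh empty entry */
    if (want_pmd)                   /* any 4k entry blocks a 2MiB grab */
        return (flags & ENTRY_PMD) ? USE_EXISTING : FAIL_EEXIST;
    /* PTE grab: split only zero/empty PMDs, keep real block mappings. */
    if ((flags & ENTRY_PMD) && (flags & (ENTRY_HZP | ENTRY_EMPTY)))
        return DOWNGRADE;
    return USE_EXISTING;
}

int main(void)
{
    printf("%d\n", grab_decision(1, 0, 1));                     /* 1: -EEXIST */
    printf("%d\n", grab_decision(1, ENTRY_PMD | ENTRY_HZP, 0)); /* 2: downgrade */
    printf("%d\n", grab_decision(1, ENTRY_PMD, 0));             /* 0: keep PMD */
    return 0;
}

Note the asymmetry the comments call out: 4k entries always win, so there is an eviction path from 2MiB down to 4k but never one in the other direction.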
+/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, bool wake_all)
+ pgoff_t index, void *entry, bool wake_all)
{
- wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+ struct exceptional_entry_key key;
+ wait_queue_head_t *wq;
+
+ wq = dax_entry_waitqueue(mapping, index, entry, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -473,54 +447,8 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
* So at this point all tasks that could have seen our entry locked
* must be in the waitqueue and the following check will see them.
*/
- if (waitqueue_active(wq)) {
- struct exceptional_entry_key key;
-
- key.mapping = mapping;
- key.index = index;
+ if (waitqueue_active(wq))
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
- }
-}
-
-void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
- void *ret, **slot;
-
- spin_lock_irq(&mapping->tree_lock);
- ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
- if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
- !slot_locked(mapping, slot))) {
- spin_unlock_irq(&mapping->tree_lock);
- return;
- }
- unlock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
- dax_wake_mapping_entry_waiter(mapping, index, false);
-}
-
-static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
-{
- if (!radix_tree_exceptional_entry(entry)) {
- unlock_page(entry);
- put_page(entry);
- } else {
- dax_unlock_mapping_entry(mapping, index);
- }
-}
-
-/*
- * Called when we are done with radix tree entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
- */
-static void put_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
-{
- if (!radix_tree_exceptional_entry(entry))
- return;
-
- /* We have to wake up next waiter for the radix tree entry lock */
- dax_wake_mapping_entry_waiter(mapping, index, false);
}
/*
@@ -547,7 +475,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
radix_tree_delete(&mapping->page_tree, index);
mapping->nrexceptional--;
spin_unlock_irq(&mapping->tree_lock);
- dax_wake_mapping_entry_waiter(mapping, index, true);
+ dax_wake_mapping_entry_waiter(mapping, index, entry, true);
return 1;
}
@@ -574,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
/* This will replace locked radix tree entry with a hole page */
page = find_or_create_page(mapping, vmf->pgoff,
vmf->gfp_mask | __GFP_ZERO);
- if (!page) {
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ if (!page)
return VM_FAULT_OOM;
- }
vmf->page = page;
return VM_FAULT_LOCKED;
}
@@ -600,11 +526,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size
return 0;
}
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
-
+/*
+ * By this point grab_mapping_entry() has ensured that we have a locked entry
+ * of the appropriate size so we don't have to worry about downgrading PMDs to
+ * PTEs. If we happen to be trying to insert a PTE and there is a PMD
+ * already in the tree, we will skip the insertion and just dirty the PMD as
+ * appropriate.
+ */
static void *dax_insert_mapping_entry(struct address_space *mapping,
struct vm_fault *vmf,
- void *entry, sector_t sector)
+ void *entry, sector_t sector,
+ unsigned long flags)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
int error = 0;
@@ -627,28 +559,43 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
if (error)
return ERR_PTR(error);
+ } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
+ /* replacing huge zero page with PMD block mapping */
+ unmap_mapping_range(mapping,
+ (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
}
spin_lock_irq(&mapping->tree_lock);
- new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
- RADIX_DAX_ENTRY_LOCK);
+ new_entry = dax_radix_locked_entry(sector, flags);
+
if (hole_fill) {
__delete_from_page_cache(entry, NULL);
/* Drop pagecache reference */
put_page(entry);
- error = radix_tree_insert(page_tree, index, new_entry);
+ error = __radix_tree_insert(page_tree, index,
+ dax_radix_order(new_entry), new_entry);
if (error) {
new_entry = ERR_PTR(error);
goto unlock;
}
mapping->nrexceptional++;
- } else {
+ } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ /*
+ * Only swap our new entry into the radix tree if the current
+ * entry is a zero page or an empty entry. If a normal PTE or
+ * PMD entry is already in the tree, we leave it alone. This
+ * means that if we are trying to insert a PTE and the
+ * existing entry is a PMD, we will just leave the PMD in the
+ * tree and dirty it if necessary.
+ */
+ struct radix_tree_node *node;
void **slot;
void *ret;
- ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+ ret = __radix_tree_lookup(page_tree, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
- radix_tree_replace_slot(slot, new_entry);
+ __radix_tree_replace(page_tree, node, slot,
+ new_entry, NULL, NULL);
}
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
@@ -668,63 +615,150 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
return new_entry;
}
+static inline unsigned long
+pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
+{
+ unsigned long address;
+
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ return address;
+}
+
+/* Walk all mappings of a given index of a file and writeprotect them */
+static void dax_mapping_entry_mkclean(struct address_space *mapping,
+ pgoff_t index, unsigned long pfn)
+{
+ struct vm_area_struct *vma;
+ pte_t *ptep;
+ pte_t pte;
+ spinlock_t *ptl;
+ bool changed;
+
+ i_mmap_lock_read(mapping);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
+ unsigned long address;
+
+ cond_resched();
+
+ if (!(vma->vm_flags & VM_SHARED))
+ continue;
+
+ address = pgoff_address(index, vma);
+ changed = false;
+ if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ continue;
+ if (pfn != pte_pfn(*ptep))
+ goto unlock;
+ if (!pte_dirty(*ptep) && !pte_write(*ptep))
+ goto unlock;
+
+ flush_cache_page(vma, address, pfn);
+ pte = ptep_clear_flush(vma, address, ptep);
+ pte = pte_wrprotect(pte);
+ pte = pte_mkclean(pte);
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+ changed = true;
+unlock:
+ pte_unmap_unlock(ptep, ptl);
+
+ if (changed)
+ mmu_notifier_invalidate_page(vma->vm_mm, address);
+ }
+ i_mmap_unlock_read(mapping);
+}
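dax_mapping_entry_mkclean() is what makes the later DIRTY-tag clear safe: it walks every VMA mapping the index and write-protects plus cleans the PTE, so any subsequent store must take a fault, which re-dirties the radix tree entry. On the PTE itself this is just two bit operations, modeled here on a fake software PTE with invented bit values.

#include <stdio.h>

#define PTE_WRITE (1u << 0)
#define PTE_DIRTY (1u << 1)

/* Model of the pte_wrprotect() + pte_mkclean() pair. */
static unsigned int pte_clean_wrprotect(unsigned int pte)
{
    pte &= ~PTE_WRITE;  /* next store faults... */
    pte &= ~PTE_DIRTY;  /* ...and the fault path re-dirties the entry */
    return pte;
}

int main(void)
{
    unsigned int pte = PTE_WRITE | PTE_DIRTY;

    printf("0x%x -> 0x%x\n", pte, pte_clean_wrprotect(pte));
    return 0;
}

The surrounding ptep_clear_flush() and mmu_notifier call are what make the transition visible to all CPUs before the cache flush proceeds.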
+
static int dax_writeback_one(struct block_device *bdev,
struct address_space *mapping, pgoff_t index, void *entry)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- int type = RADIX_DAX_TYPE(entry);
- struct radix_tree_node *node;
struct blk_dax_ctl dax;
- void **slot;
+ void *entry2, **slot;
int ret = 0;
- spin_lock_irq(&mapping->tree_lock);
/*
- * Regular page slots are stabilized by the page lock even
- * without the tree itself locked. These unlocked entries
- * need verification under the tree lock.
+ * A page got tagged dirty in DAX mapping? Something is seriously
+ * wrong.
*/
- if (!__radix_tree_lookup(page_tree, index, &node, &slot))
- goto unlock;
- if (*slot != entry)
- goto unlock;
-
- /* another fsync thread may have already written back this entry */
- if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
- goto unlock;
+ if (WARN_ON(!radix_tree_exceptional_entry(entry)))
+ return -EIO;
- if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+ spin_lock_irq(&mapping->tree_lock);
+ entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
+ /* Entry got punched out / reallocated? */
+ if (!entry2 || !radix_tree_exceptional_entry(entry2))
+ goto put_unlocked;
+ /*
+ * Entry got reallocated elsewhere? Then there is no need to write it
+ * back. We have to compare sectors, as we must not bail out due to a
+ * difference in the lock bit or the entry type.
+ */
+ if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+ goto put_unlocked;
+ if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
+ dax_is_zero_entry(entry))) {
ret = -EIO;
- goto unlock;
+ goto put_unlocked;
}
- dax.sector = RADIX_DAX_SECTOR(entry);
- dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+ /* Another fsync thread may have already written back this entry */
+ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ goto put_unlocked;
+ /* Lock the entry to serialize with page faults */
+ entry = lock_slot(mapping, slot);
+ /*
+ * We can clear the tag now but we have to be careful so that concurrent
+ * dax_writeback_one() calls for the same index cannot finish before we
+ * actually flush the caches. This is achieved as the calls will look
+ * at the entry only under tree_lock and once they do that they will
+ * see the entry locked and wait for it to unlock.
+ */
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
spin_unlock_irq(&mapping->tree_lock);
/*
+ * Even if dax_writeback_mapping_range() was given a wbc->range_start
+ * in the middle of a PMD, the 'index' we are given will be aligned to
+ * the start index of the PMD, as will the sector we pull from
+ * 'entry'. This allows us to flush for PMD_SIZE and not have to
+ * worry about partial PMD writebacks.
+ */
+ dax.sector = dax_radix_sector(entry);
+ dax.size = PAGE_SIZE << dax_radix_order(entry);
+
+ /*
* We cannot hold tree_lock while calling dax_map_atomic() because it
* eventually calls cond_resched().
*/
ret = dax_map_atomic(bdev, &dax);
- if (ret < 0)
+ if (ret < 0) {
+ put_locked_mapping_entry(mapping, index, entry);
return ret;
+ }
if (WARN_ON_ONCE(ret < dax.size)) {
ret = -EIO;
goto unmap;
}
+ dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
wb_cache_pmem(dax.addr, dax.size);
-
+ /*
+ * After we have flushed the cache, we can clear the dirty tag. There
+ * cannot be new dirty data in the pfn after the flush has completed as
+ * the pfn mappings are writeprotected and fault waits for mapping
+ * entry lock.
+ */
spin_lock_irq(&mapping->tree_lock);
- radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
unmap:
dax_unmap_atomic(bdev, &dax);
+ put_locked_mapping_entry(mapping, index, entry);
return ret;
- unlock:
+ put_unlocked:
+ put_unlocked_mapping_entry(mapping, index, entry2);
spin_unlock_irq(&mapping->tree_lock);
return ret;
}
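The reworked dax_writeback_one() is careful about ordering: TOWRITE is cleared before the flush but under the entry lock (racing writeback calls recheck the tag under tree_lock and then wait on the lock), while DIRTY is cleared only after wb_cache_pmem() returns, once the write-protected mappings can no longer dirty the pfn unobserved. A compressed, runnable trace of that sequence with the locking reduced to a flag; all names are stand-ins.

#include <stdio.h>

struct entry_model {
    int locked;
    int tag_towrite;
    int tag_dirty;
};

static void writeback_one_model(struct entry_model *e)
{
    if (!e->tag_towrite)
        return;             /* another thread already wrote this entry */
    e->locked = 1;          /* lock_slot(): concurrent faults now wait */
    e->tag_towrite = 0;     /* safe: racers see the lock and block */

    /* ...write-protect the mappings, then flush CPU caches to media... */

    e->tag_dirty = 0;       /* only after the flush has completed */
    e->locked = 0;          /* unlock and wake waiters */
}

int main(void)
{
    struct entry_model e = { .tag_towrite = 1, .tag_dirty = 1 };

    writeback_one_model(&e);
    printf("towrite=%d dirty=%d locked=%d\n",
           e.tag_towrite, e.tag_dirty, e.locked);
    return 0;
}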
@@ -738,12 +772,11 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- pgoff_t start_index, end_index, pmd_index;
+ pgoff_t start_index, end_index;
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
bool done = false;
int i, ret = 0;
- void *entry;
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
return -EIO;
@@ -753,15 +786,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
- pmd_index = DAX_PMD_INDEX(start_index);
-
- rcu_read_lock();
- entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
- rcu_read_unlock();
-
- /* see if the start of our range is covered by a PMD entry */
- if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
- start_index = pmd_index;
tag_pages_for_writeback(mapping, start_index, end_index);
@@ -794,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, sector_t sector, size_t size,
void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned long vaddr = vmf->address;
struct blk_dax_ctl dax = {
.sector = sector,
.size = size,
@@ -806,7 +830,7 @@ static int dax_insert_mapping(struct address_space *mapping,
return PTR_ERR(dax.addr);
dax_unmap_atomic(bdev, &dax);
- ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
*entryp = ret;
@@ -815,323 +839,6 @@ static int dax_insert_mapping(struct address_space *mapping,
}
/**
- * dax_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- void *entry;
- struct buffer_head bh;
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
- unsigned blkbits = inode->i_blkbits;
- sector_t block;
- pgoff_t size;
- int error;
- int major = 0;
-
- /*
- * Check whether offset isn't beyond end of file now. Caller is supposed
- * to hold locks serializing us with truncate / punch hole so this is
- * a reliable test.
- */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- return VM_FAULT_SIGBUS;
-
- memset(&bh, 0, sizeof(bh));
- block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
- bh.b_bdev = inode->i_sb->s_bdev;
- bh.b_size = PAGE_SIZE;
-
- entry = grab_mapping_entry(mapping, vmf->pgoff);
- if (IS_ERR(entry)) {
- error = PTR_ERR(entry);
- goto out;
- }
-
- error = get_block(inode, block, &bh, 0);
- if (!error && (bh.b_size < PAGE_SIZE))
- error = -EIO; /* fs corruption? */
- if (error)
- goto unlock_entry;
-
- if (vmf->cow_page) {
- struct page *new_page = vmf->cow_page;
- if (buffer_written(&bh))
- error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
- bh.b_size, new_page, vaddr);
- else
- clear_user_highpage(new_page, vaddr);
- if (error)
- goto unlock_entry;
- if (!radix_tree_exceptional_entry(entry)) {
- vmf->page = entry;
- return VM_FAULT_LOCKED;
- }
- vmf->entry = entry;
- return VM_FAULT_DAX_LOCKED;
- }
-
- if (!buffer_mapped(&bh)) {
- if (vmf->flags & FAULT_FLAG_WRITE) {
- error = get_block(inode, block, &bh, 1);
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- major = VM_FAULT_MAJOR;
- if (!error && (bh.b_size < PAGE_SIZE))
- error = -EIO;
- if (error)
- goto unlock_entry;
- } else {
- return dax_load_hole(mapping, entry, vmf);
- }
- }
-
- /* Filesystem should not return unwritten buffers to us! */
- WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
- error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
- bh.b_size, &entry, vma, vmf);
- unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
- if (error == -ENOMEM)
- return VM_FAULT_OOM | major;
- /* -EBUSY is fine, somebody else faulted on the same PTE */
- if ((error < 0) && (error != -EBUSY))
- return VM_FAULT_SIGBUS | major;
- return VM_FAULT_NOPAGE | major;
-}
-EXPORT_SYMBOL_GPL(dax_fault);
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
- * more often than one might expect in the below function.
- */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
-static void __dax_dbg(struct buffer_head *bh, unsigned long address,
- const char *reason, const char *fn)
-{
- if (bh) {
- char bname[BDEVNAME_SIZE];
- bdevname(bh->b_bdev, bname);
- pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
- "length %zd fallback: %s\n", fn, current->comm,
- address, bname, bh->b_state, (u64)bh->b_blocknr,
- bh->b_size, reason);
- } else {
- pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
- current->comm, address, reason);
- }
-}
-
-#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
-
-/**
- * dax_pmd_fault - handle a PMD fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * pmd_fault handler for DAX files.
- */
-int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, unsigned int flags, get_block_t get_block)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- struct buffer_head bh;
- unsigned blkbits = inode->i_blkbits;
- unsigned long pmd_addr = address & PMD_MASK;
- bool write = flags & FAULT_FLAG_WRITE;
- struct block_device *bdev;
- pgoff_t size, pgoff;
- sector_t block;
- int result = 0;
- bool alloc = false;
-
- /* dax pmd mappings require pfn_t_devmap() */
- if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
- return VM_FAULT_FALLBACK;
-
- /* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED)) {
- split_huge_pmd(vma, pmd, address);
- dax_pmd_dbg(NULL, address, "cow write");
- return VM_FAULT_FALLBACK;
- }
- /* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start) {
- dax_pmd_dbg(NULL, address, "vma start unaligned");
- return VM_FAULT_FALLBACK;
- }
- if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
- dax_pmd_dbg(NULL, address, "vma end unaligned");
- return VM_FAULT_FALLBACK;
- }
-
- pgoff = linear_page_index(vma, pmd_addr);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size)
- return VM_FAULT_SIGBUS;
- /* If the PMD would cover blocks out of the file */
- if ((pgoff | PG_PMD_COLOUR) >= size) {
- dax_pmd_dbg(NULL, address,
- "offset + huge page size > file size");
- return VM_FAULT_FALLBACK;
- }
-
- memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
- block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
-
- bh.b_size = PMD_SIZE;
-
- if (get_block(inode, block, &bh, 0) != 0)
- return VM_FAULT_SIGBUS;
-
- if (!buffer_mapped(&bh) && write) {
- if (get_block(inode, block, &bh, 1) != 0)
- return VM_FAULT_SIGBUS;
- alloc = true;
- WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
- }
-
- bdev = bh.b_bdev;
-
- /*
- * If the filesystem isn't willing to tell us the length of a hole,
- * just fall back to PTEs. Calling get_block 512 times in a loop
- * would be silly.
- */
- if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
- dax_pmd_dbg(&bh, address, "allocated block too small");
- return VM_FAULT_FALLBACK;
- }
-
- /*
- * If we allocated new storage, make sure no process has any
- * zero pages covering this hole
- */
- if (alloc) {
- loff_t lstart = pgoff << PAGE_SHIFT;
- loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
-
- truncate_pagecache_range(inode, lstart, lend);
- }
-
- if (!write && !buffer_mapped(&bh)) {
- spinlock_t *ptl;
- pmd_t entry;
- struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
-
- if (unlikely(!zero_page)) {
- dax_pmd_dbg(&bh, address, "no zero page");
- goto fallback;
- }
-
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_none(*pmd)) {
- spin_unlock(ptl);
- dax_pmd_dbg(&bh, address, "pmd already present");
- goto fallback;
- }
-
- dev_dbg(part_to_dev(bdev->bd_part),
- "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
- __func__, current->comm, address,
- (unsigned long long) to_sector(&bh, inode));
-
- entry = mk_pmd(zero_page, vma->vm_page_prot);
- entry = pmd_mkhuge(entry);
- set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
- result = VM_FAULT_NOPAGE;
- spin_unlock(ptl);
- } else {
- struct blk_dax_ctl dax = {
- .sector = to_sector(&bh, inode),
- .size = PMD_SIZE,
- };
- long length = dax_map_atomic(bdev, &dax);
-
- if (length < 0) {
- dax_pmd_dbg(&bh, address, "dax-error fallback");
- goto fallback;
- }
- if (length < PMD_SIZE) {
- dax_pmd_dbg(&bh, address, "dax-length too small");
- dax_unmap_atomic(bdev, &dax);
- goto fallback;
- }
- if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
- dax_pmd_dbg(&bh, address, "pfn unaligned");
- dax_unmap_atomic(bdev, &dax);
- goto fallback;
- }
-
- if (!pfn_t_devmap(dax.pfn)) {
- dax_unmap_atomic(bdev, &dax);
- dax_pmd_dbg(&bh, address, "pfn not in memmap");
- goto fallback;
- }
- dax_unmap_atomic(bdev, &dax);
-
- /*
- * For PTE faults we insert a radix tree entry for reads, and
- * leave it clean. Then on the first write we dirty the radix
- * tree entry via the dax_pfn_mkwrite() path. This sequence
- * allows the dax_pfn_mkwrite() call to be simpler and avoid a
- * call into get_block() to translate the pgoff to a sector in
- * order to be able to create a new radix tree entry.
- *
- * The PMD path doesn't have an equivalent to
- * dax_pfn_mkwrite(), though, so for a read followed by a
- * write we traverse all the way through dax_pmd_fault()
- * twice. This means we can just skip inserting a radix tree
- * entry completely on the initial read and just wait until
- * the write to insert a dirty entry.
- */
- if (write) {
- /*
- * We should insert radix-tree entry and dirty it here.
- * For now this is broken...
- */
- }
-
- dev_dbg(part_to_dev(bdev->bd_part),
- "%s: %s addr: %lx pfn: %lx sect: %llx\n",
- __func__, current->comm, address,
- pfn_t_to_pfn(dax.pfn),
- (unsigned long long) dax.sector);
- result |= vmf_insert_pfn_pmd(vma, address, pmd,
- dax.pfn, write);
- }
-
- out:
- return result;
-
- fallback:
- count_vm_event(THP_FAULT_FALLBACK);
- result = VM_FAULT_FALLBACK;
- goto out;
-}
-EXPORT_SYMBOL_GPL(dax_pmd_fault);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
@@ -1140,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
- void *entry;
+ void *entry, **slot;
pgoff_t index = vmf->pgoff;
spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || !radix_tree_exceptional_entry(entry))
- goto out;
+ entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ if (!entry || !radix_tree_exceptional_entry(entry)) {
+ if (entry)
+ put_unlocked_mapping_entry(mapping, index, entry);
+ spin_unlock_irq(&mapping->tree_lock);
+ return VM_FAULT_NOPAGE;
+ }
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
- put_unlocked_mapping_entry(mapping, index, entry);
-out:
+ entry = lock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
+ /*
+ * If we race with somebody updating the PTE and finish_mkwrite_fault()
+ * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
+ * the fault in either case.
+ */
+ finish_mkwrite_fault(vmf);
+ put_locked_mapping_entry(mapping, index, entry);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -1191,62 +908,14 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
-/**
- * dax_zero_page_range - zero a range within a page of a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @length: The number of bytes to zero
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * This function can be called by a filesystem when it is zeroing part of a
- * page in a DAX file. This is intended for hole-punch operations. If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
- get_block_t get_block)
-{
- struct buffer_head bh;
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- int err;
-
- /* Block boundary? Nothing to do */
- if (!length)
- return 0;
- BUG_ON((offset + length) > PAGE_SIZE);
-
- memset(&bh, 0, sizeof(bh));
- bh.b_bdev = inode->i_sb->s_bdev;
- bh.b_size = PAGE_SIZE;
- err = get_block(inode, index, &bh, 0);
- if (err < 0 || !buffer_written(&bh))
- return err;
-
- return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
- offset, length);
-}
-EXPORT_SYMBOL_GPL(dax_zero_page_range);
-
-/**
- * dax_truncate_page - handle a partial page being truncated in a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+#ifdef CONFIG_FS_IOMAP
+static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
- unsigned length = PAGE_ALIGN(from) - from;
- return dax_zero_page_range(inode, from, length, get_block);
+ return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}
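Factoring the sector computation into dax_iomap_sector() keeps the arithmetic in one place: page-align the position, subtract the extent's starting file offset, convert bytes to 512-byte sectors, and add the extent's first block. A runnable check with sample numbers; the struct is a reduced stand-in for struct iomap.

#include <stdio.h>

#define PAGE_MASK_MODEL (~4095UL)

struct iomap_model {
    unsigned long blkno;    /* first 512-byte sector of the extent */
    unsigned long offset;   /* file offset where the extent starts */
};

static unsigned long sector_model(const struct iomap_model *m, unsigned long pos)
{
    return m->blkno + (((pos & PAGE_MASK_MODEL) - m->offset) >> 9);
}

int main(void)
{
    /* Extent starts at file offset 1 MiB, backed by sector 2048. */
    struct iomap_model m = { .blkno = 2048, .offset = 1UL << 20 };

    /* 8 KiB into the extent = 16 sectors past blkno, so this prints 2064. */
    printf("%lu\n", sector_model(&m, (1UL << 20) + 8192));
    return 0;
}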
-EXPORT_SYMBOL_GPL(dax_truncate_page);
-#ifdef CONFIG_FS_IOMAP
static loff_t
-iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
{
struct iov_iter *iter = data;
@@ -1270,8 +939,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct blk_dax_ctl dax = { 0 };
ssize_t map_len;
- dax.sector = iomap->blkno +
- (((pos & PAGE_MASK) - iomap->offset) >> 9);
+ dax.sector = dax_iomap_sector(iomap, pos);
dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
map_len = dax_map_atomic(iomap->bdev, &dax);
if (map_len < 0) {
@@ -1303,7 +971,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
/**
- * iomap_dax_rw - Perform I/O to a DAX file
+ * dax_iomap_rw - Perform I/O to a DAX file
* @iocb: The control block for this I/O
* @iter: The addresses to do I/O from or to
* @ops: iomap ops passed from the file system
@@ -1313,7 +981,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* and evicting any page cache pages in the region under I/O.
*/
ssize_t
-iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
struct iomap_ops *ops)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
@@ -1343,7 +1011,7 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
while (iov_iter_count(iter)) {
ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
- iter, iomap_dax_actor);
+ iter, dax_iomap_actor);
if (ret <= 0)
break;
pos += ret;
@@ -1353,10 +1021,10 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
iocb->ki_pos += done;
return done ? done : ret;
}
-EXPORT_SYMBOL_GPL(iomap_dax_rw);
+EXPORT_SYMBOL_GPL(dax_iomap_rw);
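The renamed dax_iomap_rw() keeps the same shape: it repeatedly calls iomap_apply(), which asks the filesystem for one extent and feeds it to dax_iomap_actor(), until the iterator is drained or progress stops. Below is a runnable skeleton of that loop structure; apply_model() and the 16k extent size are stand-ins, not the kernel API.

#include <stdio.h>

typedef long (*actor_t)(long pos, long len);

/* Stand-in for iomap_apply(): hand out at most one 16k "extent" per call. */
static long apply_model(long pos, long len, actor_t actor)
{
    return actor(pos, len < 16384 ? len : 16384);
}

static long copy_actor(long pos, long len)
{
    printf("I/O at %ld, %ld bytes\n", pos, len);
    return len;     /* bytes completed in this extent */
}

int main(void)
{
    long pos = 0, count = 40000, done = 0, ret = 0;

    while (count) {
        ret = apply_model(pos, count, copy_actor);
        if (ret <= 0)
            break;
        pos += ret;
        done += ret;
        count -= ret;
    }
    printf("done=%ld\n", done);     /* done ? done : ret, as above */
    return 0;
}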
/**
- * iomap_dax_fault - handle a page fault on a DAX file
+ * dax_iomap_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
* @ops: iomap ops passed from the file system
@@ -1365,17 +1033,18 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw);
* or mkwrite handler for DAX files. Assumes the caller has done all the
* necessary locking for the page fault to proceed successfully.
*/
-int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
struct iomap_ops *ops)
{
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
- unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
sector_t sector;
struct iomap iomap = { 0 };
- unsigned flags = 0;
+ unsigned flags = IOMAP_FAULT;
int error, major = 0;
+ int vmf_ret = 0;
void *entry;
/*
@@ -1386,7 +1055,7 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (pos >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- entry = grab_mapping_entry(mapping, vmf->pgoff);
+ entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto out;
@@ -1405,10 +1074,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
goto unlock_entry;
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
error = -EIO; /* fs corruption? */
- goto unlock_entry;
+ goto finish_iomap;
}
- sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+ sector = dax_iomap_sector(&iomap, pos);
if (vmf->cow_page) {
switch (iomap.type) {
@@ -1427,13 +1096,13 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
if (error)
- goto unlock_entry;
- if (!radix_tree_exceptional_entry(entry)) {
- vmf->page = entry;
- return VM_FAULT_LOCKED;
- }
- vmf->entry = entry;
- return VM_FAULT_DAX_LOCKED;
+ goto finish_iomap;
+
+ __SetPageUptodate(vmf->cow_page);
+ vmf_ret = finish_fault(vmf);
+ if (!vmf_ret)
+ vmf_ret = VM_FAULT_DONE_COW;
+ goto finish_iomap;
}
switch (iomap.type) {
@@ -1448,8 +1117,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
- if (!(vmf->flags & FAULT_FLAG_WRITE))
- return dax_load_hole(mapping, entry, vmf);
+ if (!(vmf->flags & FAULT_FLAG_WRITE)) {
+ vmf_ret = dax_load_hole(mapping, entry, vmf);
+ break;
+ }
/*FALLTHRU*/
default:
WARN_ON_ONCE(1);
@@ -1457,15 +1128,218 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break;
}
+ finish_iomap:
+ if (ops->iomap_end) {
+ if (error || (vmf_ret & VM_FAULT_ERROR)) {
+ /* keep previous error */
+ ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
+ &iomap);
+ } else {
+ error = ops->iomap_end(inode, pos, PAGE_SIZE,
+ PAGE_SIZE, flags, &iomap);
+ }
+ }
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ if (vmf_ret != VM_FAULT_LOCKED || error)
+ put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error < 0 && error != -EBUSY)
return VM_FAULT_SIGBUS | major;
+ if (vmf_ret) {
+ WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
+ return vmf_ret;
+ }
return VM_FAULT_NOPAGE | major;
}
-EXPORT_SYMBOL_GPL(iomap_dax_fault);
+EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+#ifdef CONFIG_FS_DAX_PMD
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below functions.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
+ struct vm_fault *vmf, unsigned long address,
+ struct iomap *iomap, loff_t pos, bool write, void **entryp)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct block_device *bdev = iomap->bdev;
+ struct blk_dax_ctl dax = {
+ .sector = dax_iomap_sector(iomap, pos),
+ .size = PMD_SIZE,
+ };
+ long length = dax_map_atomic(bdev, &dax);
+ void *ret;
+
+ if (length < 0) /* dax_map_atomic() failed */
+ return VM_FAULT_FALLBACK;
+ if (length < PMD_SIZE)
+ goto unmap_fallback;
+ if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
+ goto unmap_fallback;
+ if (!pfn_t_devmap(dax.pfn))
+ goto unmap_fallback;
+
+ dax_unmap_atomic(bdev, &dax);
+
+ ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+ RADIX_DAX_PMD);
+ if (IS_ERR(ret))
+ return VM_FAULT_FALLBACK;
+ *entryp = ret;
+
+ return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+
+ unmap_fallback:
+ dax_unmap_atomic(bdev, &dax);
+ return VM_FAULT_FALLBACK;
+}
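PG_PMD_COLOUR is the low-bit mask of a page offset inside a PMD, and it drives both alignment checks in this new code: dax_pmd_insert_mapping() rejects a pfn whose low bits are non-zero, and dax_iomap_pmd_fault() requires the last page of the huge mapping (pgoff | PG_PMD_COLOUR) to still be inside the file. Runnable versions of both tests, assuming 4k pages and 2MiB PMDs:

#include <stdio.h>

#define PAGE_SHIFT_M    12
#define PMD_SIZE_M      (1UL << 21)
#define PG_PMD_COLOUR_M ((PMD_SIZE_M >> PAGE_SHIFT_M) - 1)  /* 511 */

static int pfn_pmd_aligned(unsigned long pfn)
{
    return !(pfn & PG_PMD_COLOUR_M);
}

static int pmd_fits_in_file(unsigned long pgoff, unsigned long max_pgoff)
{
    return (pgoff | PG_PMD_COLOUR_M) <= max_pgoff;
}

int main(void)
{
    printf("pfn 1024: %d\n", pfn_pmd_aligned(1024));    /* 1: 2 * 512 */
    printf("pfn 1030: %d\n", pfn_pmd_aligned(1030));    /* 0: misaligned */
    /* A PMD at pgoff 512 needs the file to reach pgoff 1023. */
    printf("fits: %d\n", pmd_fits_in_file(512, 1000));  /* 0: fall back */
    return 0;
}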
+
+static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
+ struct vm_fault *vmf, unsigned long address,
+ struct iomap *iomap, void **entryp)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long pmd_addr = address & PMD_MASK;
+ struct page *zero_page;
+ spinlock_t *ptl;
+ pmd_t pmd_entry;
+ void *ret;
+
+ zero_page = mm_get_huge_zero_page(vma->vm_mm);
+
+ if (unlikely(!zero_page))
+ return VM_FAULT_FALLBACK;
+
+ ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
+ RADIX_DAX_PMD | RADIX_DAX_HZP);
+ if (IS_ERR(ret))
+ return VM_FAULT_FALLBACK;
+ *entryp = ret;
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ return VM_FAULT_FALLBACK;
+ }
+
+ pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+ pmd_entry = pmd_mkhuge(pmd_entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE;
+}
+
+int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+{
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
+ struct inode *inode = mapping->host;
+ int result = VM_FAULT_FALLBACK;
+ struct iomap iomap = { 0 };
+ pgoff_t max_pgoff, pgoff;
+ struct vm_fault vmf;
+ void *entry;
+ loff_t pos;
+ int error;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ goto fallback;
+
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ goto fallback;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ goto fallback;
+
+ /*
+ * Check that the offset isn't beyond the end of the file. The caller
+ * is supposed to hold locks serializing us with truncate / punch hole,
+ * so this is a reliable test.
+ */
+ pgoff = linear_page_index(vma, pmd_addr);
+ max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+ if (pgoff > max_pgoff)
+ return VM_FAULT_SIGBUS;
+
+ /* If the PMD would extend beyond the file size */
+ if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
+ goto fallback;
+
+ /*
+ * grab_mapping_entry() will make sure we get a 2MiB empty entry, a DAX
+ * PMD or a HZP entry. If it can't (because a 4k page is already in
+ * the tree, for instance), it will return -EEXIST and we just fall
+ * back to 4k entries.
+ */
+ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+ if (IS_ERR(entry))
+ goto fallback;
+
+ /*
+ * Note that we don't use iomap_apply here. We aren't doing I/O, only
+ * setting up a mapping, so really we're using iomap_begin() as a way
+ * to look up our filesystem block.
+ */
+ pos = (loff_t)pgoff << PAGE_SHIFT;
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+ if (error)
+ goto unlock_entry;
+ if (iomap.offset + iomap.length < pos + PMD_SIZE)
+ goto finish_iomap;
+
+ vmf.pgoff = pgoff;
+ vmf.flags = flags;
+ vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+
+ switch (iomap.type) {
+ case IOMAP_MAPPED:
+ result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
+ &iomap, pos, write, &entry);
+ break;
+ case IOMAP_UNWRITTEN:
+ case IOMAP_HOLE:
+ if (WARN_ON_ONCE(write))
+ goto finish_iomap;
+ result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
+ &entry);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ finish_iomap:
+ if (ops->iomap_end) {
+ if (result == VM_FAULT_FALLBACK) {
+ ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
+ &iomap);
+ } else {
+ error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
+ iomap_flags, &iomap);
+ if (error)
+ result = VM_FAULT_FALLBACK;
+ }
+ }
+ unlock_entry:
+ put_locked_mapping_entry(mapping, pgoff, entry);
+ fallback:
+ if (result == VM_FAULT_FALLBACK) {
+ split_huge_pmd(vma, pmd, address);
+ count_vm_event(THP_FAULT_FALLBACK);
+ }
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
+#endif /* CONFIG_FS_DAX_PMD */
#endif /* CONFIG_FS_IOMAP */
diff --git a/fs/direct-io.c b/fs/direct-io.c
index fb9aa16a7727..aeae8c063451 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -457,7 +457,7 @@ static struct bio *dio_await_one(struct dio *dio)
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
@@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
* filesystems that don't need it and also allows us to create the workqueue
* late enough so the we can include s_id in the name of the workqueue.
*/
-static int sb_init_dio_done_wq(struct super_block *sb)
+int sb_init_dio_done_wq(struct super_block *sb)
{
struct workqueue_struct *old;
struct workqueue_struct *wq = alloc_workqueue("dio/%s",
@@ -843,24 +843,6 @@ out:
}
/*
- * Clean any dirty buffers in the blockdev mapping which alias newly-created
- * file blocks. Only called for S_ISREG files - blockdevs do not set
- * buffer_new
- */
-static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
-{
- unsigned i;
- unsigned nblocks;
-
- nblocks = map_bh->b_size >> dio->inode->i_blkbits;
-
- for (i = 0; i < nblocks; i++) {
- unmap_underlying_metadata(map_bh->b_bdev,
- map_bh->b_blocknr + i);
- }
-}
-
-/*
* If we are not writing the entire block and get_block() allocated
* the block for us, we need to fill-in the unused portion of the
* block with zeros. This happens only if user-buffer, fileoffset or
@@ -960,11 +942,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
goto do_holes;
sdio->blocks_available =
- map_bh->b_size >> sdio->blkbits;
+ map_bh->b_size >> blkbits;
sdio->next_block_for_io =
map_bh->b_blocknr << sdio->blkfactor;
- if (buffer_new(map_bh))
- clean_blockdev_aliases(dio, map_bh);
+ if (buffer_new(map_bh)) {
+ clean_bdev_aliases(
+ map_bh->b_bdev,
+ map_bh->b_blocknr,
+ map_bh->b_size >> blkbits);
+ }
if (!sdio->blkfactor)
goto do_holes;
@@ -1209,7 +1195,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
dio->inode = inode;
if (iov_iter_rw(iter) == WRITE) {
dio->op = REQ_OP_WRITE;
- dio->op_flags = WRITE_ODIRECT;
+ dio->op_flags = REQ_SYNC | REQ_IDLE;
} else {
dio->op = REQ_OP_READ;
}
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dcea1e37a1b7..07fed838d8fd 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -268,7 +268,7 @@ void dlm_callback_work(struct work_struct *work)
int dlm_callback_start(struct dlm_ls *ls)
{
ls->ls_callback_wq = alloc_workqueue("dlm_callback",
- WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WQ_HIGHPRI | WQ_MEM_RECLAIM, 0);
if (!ls->ls_callback_wq) {
log_print("can't start dlm_callback workqueue");
return -ENOMEM;
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index df955d2209ce..7211e826d90d 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -12,7 +12,7 @@
******************************************************************************/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/configfs.h>
#include <linux/slab.h>
#include <linux/in.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 466f7d60edc2..ca7089aeadab 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -12,7 +12,7 @@
#include <linux/pagemap.h>
#include <linux/seq_file.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 216b61604ef9..b670f5601fbb 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -18,7 +18,6 @@
* This is the main header file to be included in each DLM source file.
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/types.h>
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f3e72787e7f9..91592b75c309 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -11,6 +11,8 @@
*******************************************************************************
******************************************************************************/
+#include <linux/module.h>
+
#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 609998de533e..7d398d300e97 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -519,29 +519,25 @@ out:
/* Note: sk_callback_lock must be locked before calling this function. */
static void save_callbacks(struct connection *con, struct sock *sk)
{
- lock_sock(sk);
con->orig_data_ready = sk->sk_data_ready;
con->orig_state_change = sk->sk_state_change;
con->orig_write_space = sk->sk_write_space;
con->orig_error_report = sk->sk_error_report;
- release_sock(sk);
}
static void restore_callbacks(struct connection *con, struct sock *sk)
{
write_lock_bh(&sk->sk_callback_lock);
- lock_sock(sk);
sk->sk_user_data = NULL;
sk->sk_data_ready = con->orig_data_ready;
sk->sk_state_change = con->orig_state_change;
sk->sk_write_space = con->orig_write_space;
sk->sk_error_report = con->orig_error_report;
- release_sock(sk);
write_unlock_bh(&sk->sk_callback_lock);
}
/* Make a socket active */
-static void add_sock(struct socket *sock, struct connection *con)
+static void add_sock(struct socket *sock, struct connection *con, bool save_cb)
{
struct sock *sk = sock->sk;
@@ -549,7 +545,7 @@ static void add_sock(struct socket *sock, struct connection *con)
con->sock = sock;
sk->sk_user_data = con;
- if (!test_bit(CF_IS_OTHERCON, &con->flags))
+ if (save_cb)
save_callbacks(con, sk);
/* Install a data_ready callback */
sk->sk_data_ready = lowcomms_data_ready;
@@ -806,7 +802,7 @@ static int tcp_accept_from_sock(struct connection *con)
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
- add_sock(newsock, othercon);
+ add_sock(newsock, othercon, false);
addcon = othercon;
}
else {
@@ -819,7 +815,10 @@ static int tcp_accept_from_sock(struct connection *con)
else {
newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
- add_sock(newsock, newcon);
+ /* accept copies the sk after we've saved the callbacks, so we
+ don't want to save them a second time or comm errors will
+ result in calling sk_error_report recursively. */
+ add_sock(newsock, newcon, false);
addcon = newcon;
}
@@ -880,7 +879,8 @@ static int sctp_accept_from_sock(struct connection *con)
}
make_sockaddr(&prim.ssp_addr, 0, &addr_len);
- if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+ ret = addr_to_nodeid(&prim.ssp_addr, &nodeid);
+ if (ret) {
unsigned char *b = (unsigned char *)&prim.ssp_addr;
log_print("reject connect from unknown addr");
@@ -919,7 +919,7 @@ static int sctp_accept_from_sock(struct connection *con)
newcon->othercon = othercon;
othercon->sock = newsock;
newsock->sk->sk_user_data = othercon;
- add_sock(newsock, othercon);
+ add_sock(newsock, othercon, false);
addcon = othercon;
} else {
printk("Extra connection from node %d attempted\n", nodeid);
@@ -930,7 +930,7 @@ static int sctp_accept_from_sock(struct connection *con)
} else {
newsock->sk->sk_user_data = newcon;
newcon->rx_action = receive_from_sock;
- add_sock(newsock, newcon);
+ add_sock(newsock, newcon, false);
addcon = newcon;
}
@@ -1058,7 +1058,7 @@ static void sctp_connect_to_sock(struct connection *con)
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = sctp_connect_to_sock;
- add_sock(sock, con);
+ add_sock(sock, con, true);
/* Bind to all addresses. */
if (sctp_bind_addrs(con, 0))
@@ -1146,7 +1146,7 @@ static void tcp_connect_to_sock(struct connection *con)
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
con->connect_action = tcp_connect_to_sock;
- add_sock(sock, con);
+ add_sock(sock, con, true);
/* Bind to our cluster-known address connecting to avoid
routing problems */
@@ -1366,7 +1366,7 @@ static int tcp_listen_for_all(void)
sock = tcp_create_listen_sock(con, dlm_local_addr[0]);
if (sock) {
- add_sock(sock, con);
+ add_sock(sock, con, true);
result = 0;
}
else {
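
A hedged sketch of the contract these hunks establish (the helper below is illustrative, not from the patch, and reuses lowcomms.c internals): sockets created locally pass save_cb == true so the original callbacks can be restored at teardown, while accepted sockets were cloned from a listener whose callbacks were already saved, so they pass false — re-saving would record the DLM handlers as the "originals" and make sk_error_report recurse on communication errors.

#include <net/sock.h>

/* sk_callback_lock replaces the old lock_sock()/release_sock() pair */
static void install_dlm_callbacks(struct sock *sk, struct connection *con,
				  bool save_cb)
{
	write_lock_bh(&sk->sk_callback_lock);
	if (save_cb)			/* locally created socket */
		save_callbacks(con, sk);
	/* accepted socket: callbacks already saved on the listener */
	sk->sk_data_ready = lowcomms_data_ready;
	write_unlock_bh(&sk->sk_callback_lock);
}
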
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 079c0bd71ab7..8e1b618891be 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -11,6 +11,8 @@
*******************************************************************************
******************************************************************************/
+#include <linux/module.h>
+
#include "dlm_internal.h"
#include "lockspace.h"
#include "lock.h"
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 1e6e227134d7..43a96c330570 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -16,11 +16,7 @@
static uint32_t dlm_nl_seqnum;
static uint32_t listener_nlportid;
-static struct genl_family family = {
- .id = GENL_ID_GENERATE,
- .name = DLM_GENL_NAME,
- .version = DLM_GENL_VERSION,
-};
+static struct genl_family family;
static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
{
@@ -69,16 +65,24 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static struct genl_ops dlm_nl_ops[] = {
+static const struct genl_ops dlm_nl_ops[] = {
{
.cmd = DLM_CMD_HELLO,
.doit = user_cmd,
},
};
+static struct genl_family family __ro_after_init = {
+ .name = DLM_GENL_NAME,
+ .version = DLM_GENL_VERSION,
+ .ops = dlm_nl_ops,
+ .n_ops = ARRAY_SIZE(dlm_nl_ops),
+ .module = THIS_MODULE,
+};
+
int __init dlm_netlink_init(void)
{
- return genl_register_family_with_ops(&family, dlm_nl_ops);
+ return genl_register_family(&family);
}
void dlm_netlink_exit(void)
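
The shape of the new-style registration, sketched with hypothetical names: the 4.10 generic-netlink core assigns the family id itself (GENL_ID_GENERATE is gone), and the ops are attached to the family before a single genl_register_family() call.

#include <net/genetlink.h>

static int my_doit(struct sk_buff *skb, struct genl_info *info)
{
	return 0;			/* hypothetical handler */
}

static const struct genl_ops my_ops[] = {
	{ .cmd = 1, .doit = my_doit },
};

static struct genl_family my_family __ro_after_init = {
	.name	 = "MY_FAMILY",		/* hypothetical name */
	.version = 1,
	.ops	 = my_ops,
	.n_ops	 = ARRAY_SIZE(my_ops),
	.module	 = THIS_MODULE,
};

static int __init my_init(void)
{
	return genl_register_family(&my_family); /* id assigned by core */
}
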
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 58c2f4a21b7f..1ce908c2232c 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -9,7 +9,6 @@
#include <linux/miscdevice.h>
#include <linux/init.h>
#include <linux/wait.h>
-#include <linux/module.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/poll.h>
diff --git a/fs/exec.c b/fs/exec.c
index 4e497b9ee71e..8112eacf10f3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* doing the exec and bprm->mm is the new process's mm.
*/
ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
- &page, NULL);
+ &page, NULL, NULL);
if (ret <= 0)
return NULL;
@@ -1169,8 +1169,10 @@ no_thread_group:
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
+#ifdef CONFIG_POSIX_TIMERS
exit_itimers(sig);
flush_itimer_signals();
+#endif
if (atomic_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
@@ -1275,8 +1277,22 @@ EXPORT_SYMBOL(flush_old_exec);
void would_dump(struct linux_binprm *bprm, struct file *file)
{
- if (inode_permission(file_inode(file), MAY_READ) < 0)
+ struct inode *inode = file_inode(file);
+ if (inode_permission(inode, MAY_READ) < 0) {
+ struct user_namespace *old, *user_ns;
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+
+ /* Ensure mm->user_ns contains the executable */
+ user_ns = old = bprm->mm->user_ns;
+ while ((user_ns != &init_user_ns) &&
+ !privileged_wrt_inode_uidgid(user_ns, inode))
+ user_ns = user_ns->parent;
+
+ if (old != user_ns) {
+ bprm->mm->user_ns = get_user_ns(user_ns);
+ put_user_ns(old);
+ }
+ }
}
EXPORT_SYMBOL(would_dump);
@@ -1306,7 +1322,6 @@ void setup_new_exec(struct linux_binprm * bprm)
!gid_eq(bprm->cred->gid, current_egid())) {
current->pdeath_signal = 0;
} else {
- would_dump(bprm, bprm->file);
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
set_dumpable(current->mm, suid_dumpable);
}
@@ -1406,7 +1421,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
unsigned n_fs;
if (p->ptrace) {
- if (p->ptrace & PT_PTRACE_CAP)
+ if (ptracer_capable(p, current_user_ns()))
bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
else
bprm->unsafe |= LSM_UNSAFE_PTRACE;
@@ -1741,6 +1756,8 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out;
+ would_dump(bprm, bprm->file);
+
retval = exec_binprm(bprm);
if (retval < 0)
goto out;

diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a0e1478dfd04..b0f241528a30 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
return 0; /* skip atime */
inode_lock_shared(inode);
- ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops);
+ ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret)
goto out_unlock;
- ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
mark_inode_dirty(inode);
@@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
- ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops);
+ ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
@@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return ret;
}
-static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned int flags)
-{
- struct inode *inode = file_inode(vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
- int ret;
-
- if (flags & FAULT_FLAG_WRITE) {
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
- }
- down_read(&ei->dax_sem);
-
- ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
-
- up_read(&ei->dax_sem);
- if (flags & FAULT_FLAG_WRITE)
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
@@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
- .pmd_fault = ext2_dax_pmd_fault,
+ /*
+ * .pmd_fault is not supported for DAX because allocation in ext2
+ * cannot be reliably aligned to huge page sizes and so pmd faults
+ * will always fail and fall back to regular faults.
+ */
.page_mkwrite = ext2_dax_fault,
.pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
@@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
#else
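
The read side that both ext2 (above) and ext4 (below) converge on has the same shape; a minimal sketch, assuming only an iomap_ops instance supplied by the filesystem:

#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t dax_read_sketch(struct kiocb *iocb, struct iov_iter *to,
			       struct iomap_ops *ops)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;			/* nothing to do, skip atime */

	inode_lock_shared(inode);		/* serialize against truncate */
	ret = dax_iomap_rw(iocb, to, ops);	/* iomap-based DAX I/O */
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
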
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 41b8b44a391c..e173afe92661 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode,
}
if (IS_DAX(inode)) {
- int i;
-
/*
* We must unmap blocks before zeroing so that writeback cannot
* overwrite zeros with stale data from block device page cache.
*/
- for (i = 0; i < count; i++) {
- unmap_underlying_metadata(inode->i_sb->s_bdev,
- le32_to_cpu(chain[depth-1].key) + i);
- }
+ clean_bdev_aliases(inode->i_sb->s_bdev,
+ le32_to_cpu(chain[depth-1].key),
+ count);
/*
* block must be initialised before we put it in the tree
* so that it's not found by another thread before it's
@@ -850,6 +847,9 @@ struct iomap_ops ext2_iomap_ops = {
.iomap_begin = ext2_iomap_begin,
.iomap_end = ext2_iomap_end,
};
+#else
+/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
+struct iomap_ops ext2_iomap_ops;
#endif /* CONFIG_FS_DAX */
int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -1293,9 +1293,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);
- if (IS_DAX(inode))
- error = dax_truncate_page(inode, newsize, ext2_get_block);
- else if (test_opt(inode->i_sb, NOBH))
+ if (IS_DAX(inode)) {
+ error = iomap_zero_range(inode, newsize,
+ PAGE_ALIGN(newsize) - newsize, NULL,
+ &ext2_iomap_ops);
+ } else if (test_opt(inode->i_sb, NOBH))
error = nobh_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
else
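
What replaces dax_truncate_page() here, sketched: on truncate, the range from the new EOF to its page boundary must be zeroed so a later mmap cannot observe stale bytes. With the length computed as PAGE_ALIGN(newsize) - newsize, the call degenerates to a no-op when newsize is already page aligned (a sketch assuming the 4.10 iomap_zero_range() signature with its bool *did_zero argument):

#include <linux/iomap.h>
#include <linux/mm.h>

static int zero_eof_tail(struct inode *inode, loff_t newsize,
			 struct iomap_ops *ops)
{
	loff_t len = PAGE_ALIGN(newsize) - newsize;	/* 0 if aligned */

	if (!len)
		return 0;
	return iomap_zero_range(inode, newsize, len, NULL, ops);
}
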
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index e38039fd96ff..7b90691e98c4 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,6 +37,7 @@ config EXT4_FS
select CRC16
select CRYPTO
select CRYPTO_CRC32C
+ select FS_IOMAP if FS_DAX
help
This is the next generation of the ext3 filesystem.
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index dfa519979038..fd389935ecd1 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
if (error)
return error;
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
}
break;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a8a750f59621..2163c1e69f2a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -397,8 +397,9 @@ struct flex_groups {
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
+/* Flags we can manipulate through EXT4_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
EXT4_IMMUTABLE_FL | \
EXT4_APPEND_FL | \
@@ -1533,12 +1534,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
return container_of(inode, struct ext4_inode_info, vfs_inode);
}
-static inline struct timespec ext4_current_time(struct inode *inode)
-{
- return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
- current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
-}
-
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
@@ -2277,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-static inline int ext4_sb_has_crypto(struct super_block *sb)
-{
- return ext4_has_feature_encrypt(sb);
-}
-
static inline bool ext4_encrypted_inode(struct inode *inode)
{
return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
@@ -2339,8 +2329,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy fscrypt_notsupp_process_policy
-#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy
#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
@@ -2458,8 +2448,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
@@ -2492,7 +2480,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
-extern void ext4_truncate(struct inode *);
+extern int ext4_truncate(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
@@ -3129,7 +3117,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_truncate(handle_t *, struct inode *);
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
@@ -3265,12 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
}
-static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
-{
- int blksize = 1 << inode->i_blkbits;
-
- return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
-}
+extern struct iomap_ops ext4_iomap_ops;
#endif /* __KERNEL__ */
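
The EXT4_FL_USER_MODIFIABLE change above is easier to read as a bit delta; assuming the standard flag values from this header, the two newly-listed bits are exactly the ones ioctl.c below starts to special-case:

/* sketch, not from the patch:
 *
 *	0x204BC0FF ^ 0x204380FF == 0x00084000
 *
 *	0x00004000 == EXT4_JOURNAL_DATA_FL
 *	0x00080000 == EXT4_EXTENTS_FL
 *
 * Both become nominally modifiable via SETFLAGS but are filtered out
 * again in ext4_ioctl_setflags() before the generic bit loop runs.
 */
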
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b1d52c14098e..f97611171023 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
- test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
- return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
- if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))) {
+ /* We do not support data journalling for encrypted data */
+ if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ }
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- else
- BUG();
+ BUG();
}
static inline int ext4_should_journal_data(struct inode *inode)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c930a0110fb4..b1f8416923ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3777,14 +3777,6 @@ out:
return err;
}
-static void unmap_underlying_metadata_blocks(struct block_device *bdev,
- sector_t block, int count)
-{
- int i;
- for (i = 0; i < count; i++)
- unmap_underlying_metadata(bdev, block + i);
-}
-
/*
* Handle EOFBLOCKS_FL flag, clearing it if necessary
*/
@@ -4121,9 +4113,8 @@ out:
* new.
*/
if (allocated > map->m_len) {
- unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
- newblock + map->m_len,
- allocated - map->m_len);
+ clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
+ allocated - map->m_len);
allocated = map->m_len;
}
map->m_len = allocated;
@@ -4631,7 +4622,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(handle_t *handle, struct inode *inode)
+int ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
@@ -4645,7 +4636,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err)
+ return err;
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
@@ -4657,12 +4650,9 @@ retry:
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- if (err) {
- ext4_std_error(inode->i_sb, err);
- return;
- }
- err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
- ext4_std_error(inode->i_sb, err);
+ if (err)
+ return err;
+ return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
@@ -4701,7 +4691,7 @@ retry:
/*
* Recalculate credits when extent tree depth changes.
*/
- if (depth >= 0 && depth != ext_depth(inode)) {
+ if (depth != ext_depth(inode)) {
credits = ext4_chunk_trans_blocks(inode, len);
depth = ext_depth(inode);
}
@@ -4725,7 +4715,7 @@ retry:
map.m_lblk += ret;
map.m_len = len = len - ret;
epos = (loff_t)map.m_lblk << inode->i_blkbits;
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
if (new_size) {
if (epos > new_size)
epos = new_size;
@@ -4853,7 +4843,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
@@ -4878,7 +4868,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_dio;
}
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
if (new_size) {
ext4_update_inode_size(inode, new_size);
} else {
@@ -5568,7 +5558,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
@@ -5678,7 +5668,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2a822d30e73f..b5f184493c57 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,6 +31,42 @@
#include "xattr.h"
#include "acl.h"
+#ifdef CONFIG_FS_DAX
+static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ inode_lock_shared(inode);
+ /*
+ * Recheck under inode lock - at this point we are sure it cannot
+ * change anymore
+ */
+ if (!IS_DAX(inode)) {
+ inode_unlock_shared(inode);
+ /* Fall back to buffered IO in case we cannot support DAX */
+ return generic_file_read_iter(iocb, to);
+ }
+ ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
+#endif
+
+static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ if (!iov_iter_count(to))
+ return 0; /* skip atime */
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(file_inode(iocb->ki_filp)))
+ return ext4_dax_read_iter(iocb, to);
+#endif
+ return generic_file_read_iter(iocb, to);
+}
+
/*
* Called when an inode is released. Note that this is different
* from ext4_file_open: open gets called at every open, but release
@@ -88,6 +124,86 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
return 0;
}
+/* Is IO overwriting allocated and initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+{
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, blklen;
+
+ if (pos + len > i_size_read(inode))
+ return false;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
+ blklen = map.m_len;
+
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of the blocks have been preallocated,
+ * regardless of whether they have been initialized or not. To exclude
+ * unwritten extents, we need to check m_flags.
+ */
+ return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+}
+
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ return ret;
+ /*
+ * If we have encountered a bitmap-format file, the size limit
+ * is smaller than s_maxbytes, which is for extent-mapped files.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
+ return -EFBIG;
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
+ }
+ return iov_iter_count(from);
+}
+
+#ifdef CONFIG_FS_DAX
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+ bool overwrite = false;
+
+ inode_lock(inode);
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ goto out;
+ ret = file_update_time(iocb->ki_filp);
+ if (ret)
+ goto out;
+
+ if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
+ overwrite = true;
+ downgrade_write(&inode->i_rwsem);
+ }
+ ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+out:
+ if (!overwrite)
+ inode_unlock(inode);
+ else
+ inode_unlock_shared(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+#endif
+
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -97,8 +213,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
int overwrite = 0;
ssize_t ret;
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return ext4_dax_write_iter(iocb, from);
+#endif
+
inode_lock(inode);
- ret = generic_write_checks(iocb, from);
+ ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -114,53 +235,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ext4_unwritten_wait(inode);
}
- /*
- * If we have encountered a bitmap-format file, the size limit
- * is smaller than s_maxbytes, which is for extent-mapped files.
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-
- if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
- ret = -EFBIG;
- goto out;
- }
- iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
- }
-
iocb->private = &overwrite;
- if (o_direct) {
- size_t length = iov_iter_count(from);
- loff_t pos = iocb->ki_pos;
-
- /* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- pos + length <= i_size_read(inode)) {
- struct ext4_map_blocks map;
- unsigned int blkbits = inode->i_blkbits;
- int err, len;
-
- map.m_lblk = pos >> blkbits;
- map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits);
- len = map.m_len;
-
- err = ext4_map_blocks(NULL, inode, &map, 0);
- /*
- * 'err==len' means that all of blocks has
- * been preallocated no matter they are
- * initialized or not. For excluding
- * unwritten extents, we need to check
- * m_flags. There are two conditions that
- * indicate for initialized extents. 1) If we
- * hit extent cache, EXT4_MAP_MAPPED flag is
- * returned; 2) If we do a real lookup,
- * non-flags are returned. So we should check
- * these two conditions.
- */
- if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
- overwrite = 1;
- }
- }
+ /* Check whether we do a DIO overwrite or not */
+ if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
+ ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
+ overwrite = 1;
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
@@ -196,7 +275,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = dax_fault(vma, vmf, ext4_dax_get_block);
+ result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
if (write) {
if (!IS_ERR(handle))
@@ -230,9 +309,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
- else
- result = dax_pmd_fault(vma, addr, pmd, flags,
- ext4_dax_get_block);
+ else {
+ result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
+ &ext4_iomap_ops);
+ }
if (write) {
if (!IS_ERR(handle))
@@ -687,7 +767,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
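
The locking pattern ext4_dax_write_iter introduces above is worth isolating. A sketch, reusing the patch's ext4_overwrite_io() helper and assuming only i_rwsem semantics: a pure overwrite of already-written blocks cannot change the allocation, so the exclusive lock taken for the checks can be downgraded and concurrent overwriters proceed under the shared lock.

static ssize_t dax_write_locked(struct kiocb *iocb, struct iov_iter *from,
				struct inode *inode, struct iomap_ops *ops)
{
	bool shared = false;
	ssize_t ret;

	inode_lock(inode);			/* exclusive for the checks */
	if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
		shared = true;
		downgrade_write(&inode->i_rwsem); /* let other overwriters in */
	}
	ret = dax_iomap_rw(iocb, from, ops);
	if (shared)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
	return ret;
}
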
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 170421edfdfe..e57e8d90ea54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1039,7 +1039,7 @@ got:
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
- ext4_current_time(inode);
+ current_time(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
@@ -1115,8 +1115,7 @@ got:
}
if (encrypt) {
- /* give pointer to avoid set_context with journal ops. */
- err = fscrypt_inherit_context(dir, inode, &encrypt, true);
+ err = fscrypt_inherit_context(dir, inode, handle, true);
if (err)
goto fail_free_drop;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index f74d5ee2cdec..437df6a1a841 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle,
EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ /*
+ * Propagate changes to inode->i_flags as well - e.g. S_DAX may
+ * get cleared
+ */
+ ext4_set_inode_flags(inode);
get_bh(is.iloc.bh);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -336,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
- if (!value)
+ if (!value) {
+ error = -ENOMEM;
goto out;
+ }
error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
value, len);
@@ -442,6 +449,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
}
}
ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ /*
+ * Propagate changes to inode->i_flags as well - e.g. S_DAX may
+ * get set.
+ */
+ ext4_set_inode_flags(inode);
get_bh(is.iloc.bh);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -1028,7 +1040,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
@@ -1971,7 +1983,7 @@ out:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9c064727ed62..88d57af1b516 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
+#include <linux/iomap.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -71,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
- EXT4_INODE_SIZE(inode->i_sb) -
- offset);
}
+ csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ EXT4_INODE_SIZE(inode->i_sb) - offset);
}
return csum;
@@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode)
"couldn't mark inode dirty (err %d)", err);
goto stop_handle;
}
- if (inode->i_blocks)
- ext4_truncate(inode);
+ if (inode->i_blocks) {
+ err = ext4_truncate(inode);
+ if (err) {
+ ext4_error(inode->i_sb,
+ "couldn't truncate inode %lu (err %d)",
+ inode->i_ino, err);
+ goto stop_handle;
+ }
+ }
/*
* ext4_ext_truncate() doesn't reserve any slop when it
@@ -654,12 +661,8 @@ found:
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
- ext4_lblk_t i;
-
- for (i = 0; i < map->m_len; i++) {
- unmap_underlying_metadata(inode->i_sb->s_bdev,
- map->m_pblk + i);
- }
+ clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
+ map->m_len);
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -767,6 +770,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
ext4_update_bh_state(bh, map.m_flags);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
+ } else if (ret == 0) {
+ /* hole case, need to fill in bh->b_size */
+ bh->b_size = inode->i_sb->s_blocksize * map.m_len;
}
return ret;
}
@@ -1127,8 +1133,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (err)
break;
if (buffer_new(bh)) {
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -1166,7 +1171,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = fscrypt_decrypt_page(page);
+ err = fscrypt_decrypt_page(page->mapping->host, page,
+ PAGE_SIZE, 0, page->index);
return err;
}
#endif
@@ -2360,11 +2366,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
BUG_ON(map->m_len == 0);
if (map->m_flags & EXT4_MAP_NEW) {
- struct block_device *bdev = inode->i_sb->s_bdev;
- int i;
-
- for (i = 0; i < map->m_len; i++)
- unmap_underlying_metadata(bdev, map->m_pblk + i);
+ clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
+ map->m_len);
}
return 0;
}
@@ -2891,7 +2894,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb)) {
+ if (ext4_nonda_switch(inode->i_sb) ||
+ S_ISLNK(inode->i_mode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
len, flags, pagep, fsdata);
@@ -3268,53 +3272,159 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
#ifdef CONFIG_FS_DAX
-/*
- * Get block function for DAX IO and mmap faults. It takes care of converting
- * unwritten extents to written ones and initializes new / converted blocks
- * to zeros.
- */
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap)
{
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned long first_block = offset >> blkbits;
+ unsigned long last_block = (offset + length - 1) >> blkbits;
+ struct ext4_map_blocks map;
int ret;
- ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create);
- if (!create)
- return _ext4_get_block(inode, iblock, bh_result, 0);
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_PRE_IO |
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0)
- return ret;
+ map.m_lblk = first_block;
+ map.m_len = last_block - first_block + 1;
- if (buffer_unwritten(bh_result)) {
+ if (!(flags & IOMAP_WRITE)) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ } else {
+ int dio_credits;
+ handle_t *handle;
+ int retries = 0;
+
+ /* Trim mapping request to maximum we can map at once for DIO */
+ if (map.m_len > DIO_MAX_BLOCKS)
+ map.m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
+retry:
/*
- * We are protected by i_mmap_sem or i_mutex so we know block
- * cannot go away from under us even though we dropped
- * i_data_sem. Convert extent to written and write zeros there.
+ * Either we allocate blocks and then we don't get unwritten
+ * extent so we have reserved enough credits, or the blocks
+ * are already allocated and unwritten and in that case
+ * extent conversion fits in the credits as well.
*/
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_CONVERT |
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0)
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = ext4_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (ret < 0) {
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
return ret;
+ }
+
+ /*
+ * If we added blocks beyond i_size, we need to make sure they
+ * will get truncated if we crash before updating i_size in
+ * ext4_iomap_end(). For faults we don't need to do that (and
+ * even cannot because for orphan list operations inode_lock is
+ * required) - if we happen to instantiate a block beyond i_size,
+ * it is because we race with truncate which has already added
+ * the inode to the orphan list.
+ */
+ if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
+ (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
+ int err;
+
+ err = ext4_orphan_add(handle, inode);
+ if (err < 0) {
+ ext4_journal_stop(handle);
+ return err;
+ }
+ }
+ ext4_journal_stop(handle);
}
- /*
- * At least for now we have to clear BH_New so that DAX code
- * doesn't attempt to zero blocks again in a racy way.
- */
- clear_buffer_new(bh_result);
+
+ iomap->flags = 0;
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = first_block << blkbits;
+
+ if (ret == 0) {
+ iomap->type = IOMAP_HOLE;
+ iomap->blkno = IOMAP_NULL_BLOCK;
+ iomap->length = (u64)map.m_len << blkbits;
+ } else {
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ iomap->type = IOMAP_UNWRITTEN;
+ } else {
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+ iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9);
+ iomap->length = (u64)map.m_len << blkbits;
+ }
+
+ if (map.m_flags & EXT4_MAP_NEW)
+ iomap->flags |= IOMAP_F_NEW;
return 0;
}
-#else
-/* Just define empty function, it will never get called. */
-int ext4_dax_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
{
- BUG();
- return 0;
+ int ret = 0;
+ handle_t *handle;
+ int blkbits = inode->i_blkbits;
+ bool truncate = false;
+
+ if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
+ return 0;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto orphan_del;
+ }
+ if (ext4_update_inode_size(inode, offset + written))
+ ext4_mark_inode_dirty(handle, inode);
+ /*
+ * We may need to truncate allocated but not written blocks beyond EOF.
+ */
+ if (iomap->offset + iomap->length >
+ ALIGN(inode->i_size, 1 << blkbits)) {
+ ext4_lblk_t written_blk, end_blk;
+
+ written_blk = (offset + written) >> blkbits;
+ end_blk = (offset + length) >> blkbits;
+ if (written_blk < end_blk && ext4_can_truncate(inode))
+ truncate = true;
+ }
+ /*
+ * Remove inode from orphan list if we were extending an inode and
+ * everything went fine.
+ */
+ if (!truncate && inode->i_nlink &&
+ !list_empty(&EXT4_I(inode)->i_orphan))
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ if (truncate) {
+ ext4_truncate_failed_write(inode);
+orphan_del:
+ /*
+ * If truncate failed early the inode might still be on the
+ * orphan list; we need to make sure the inode is removed from
+ * the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+ return ret;
}
+
+struct iomap_ops ext4_iomap_ops = {
+ .iomap_begin = ext4_iomap_begin,
+ .iomap_end = ext4_iomap_end,
+};
+
#endif
static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -3436,19 +3546,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
iocb->private = NULL;
if (overwrite)
get_block_func = ext4_dio_get_block_overwrite;
- else if (IS_DAX(inode)) {
- /*
- * We can avoid zeroing for aligned DAX writes beyond EOF. Other
- * writes need zeroing either because they can race with page
- * faults or because they use partial blocks.
- */
- if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size &&
- ext4_aligned_io(inode, offset, count))
- get_block_func = ext4_dio_get_block;
- else
- get_block_func = ext4_dax_get_block;
- dio_flags = DIO_LOCKING;
- } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) {
get_block_func = ext4_dio_get_block;
dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
@@ -3462,14 +3560,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
#ifdef CONFIG_EXT4_FS_ENCRYPTION
BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
#endif
- if (IS_DAX(inode)) {
- ret = dax_do_io(iocb, inode, iter, get_block_func,
- ext4_end_io_dio, dio_flags);
- } else
- ret = __blockdev_direct_IO(iocb, inode,
- inode->i_sb->s_bdev, iter,
- get_block_func,
- ext4_end_io_dio, NULL, dio_flags);
+ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
+ get_block_func, ext4_end_io_dio, NULL,
+ dio_flags);
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
@@ -3538,6 +3631,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
/*
@@ -3546,19 +3640,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
* we are protected against page writeback as well.
*/
inode_lock_shared(inode);
- if (IS_DAX(inode)) {
- ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0);
- } else {
- size_t count = iov_iter_count(iter);
-
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count);
- if (ret)
- goto out_unlock;
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, ext4_dio_get_block,
- NULL, NULL, 0);
- }
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
+ iocb->ki_pos + count);
+ if (ret)
+ goto out_unlock;
+ ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, ext4_dio_get_block, NULL, NULL, 0);
out_unlock:
inode_unlock_shared(inode);
return ret;
@@ -3587,6 +3674,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (ext4_has_inline_data(inode))
return 0;
+ /* DAX uses iomap path now */
+ if (WARN_ON_ONCE(IS_DAX(inode)))
+ return 0;
+
trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
if (iov_iter_rw(iter) == READ)
ret = ext4_direct_IO_read(iocb, iter);
@@ -3615,6 +3706,13 @@ static int ext4_journalled_set_page_dirty(struct page *page)
return __set_page_dirty_nobuffers(page);
}
+static int ext4_set_page_dirty(struct page *page)
+{
+ WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
+ WARN_ON_ONCE(!page_has_buffers(page));
+ return __set_page_dirty_buffers(page);
+}
+
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
@@ -3622,6 +3720,7 @@ static const struct address_space_operations ext4_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
+ .set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3654,6 +3753,7 @@ static const struct address_space_operations ext4_da_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
+ .set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = ext4_da_invalidatepage,
.releasepage = ext4_releasepage,
@@ -3743,7 +3843,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_SIZE);
- WARN_ON_ONCE(fscrypt_decrypt_page(page));
+ WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
+ page, PAGE_SIZE, 0, page->index));
}
}
if (ext4_should_journal_data(inode)) {
@@ -3792,8 +3893,10 @@ static int ext4_block_zero_page_range(handle_t *handle,
if (length > max || length < 0)
length = max;
- if (IS_DAX(inode))
- return dax_zero_page_range(inode, from, length, ext4_get_block);
+ if (IS_DAX(inode)) {
+ return iomap_zero_range(inode, from, length, NULL,
+ &ext4_iomap_ops);
+ }
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
@@ -4026,7 +4129,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
@@ -4091,10 +4194,11 @@ int ext4_inode_attach_jinode(struct inode *inode)
* that's fine - as long as they are linked from the inode, the post-crash
* ext4_truncate() run will find them and release them.
*/
-void ext4_truncate(struct inode *inode)
+int ext4_truncate(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int credits;
+ int err = 0;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
@@ -4108,7 +4212,7 @@ void ext4_truncate(struct inode *inode)
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return;
+ return 0;
ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
@@ -4120,13 +4224,13 @@ void ext4_truncate(struct inode *inode)
ext4_inline_data_truncate(inode, &has_inline);
if (has_inline)
- return;
+ return 0;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
if (ext4_inode_attach_jinode(inode) < 0)
- return;
+ return 0;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4135,10 +4239,8 @@ void ext4_truncate(struct inode *inode)
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ext4_std_error(inode->i_sb, PTR_ERR(handle));
- return;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4152,7 +4254,8 @@ void ext4_truncate(struct inode *inode)
* Implication: the file must always be in a sane, consistent
* truncatable state while each transaction commits.
*/
- if (ext4_orphan_add(handle, inode))
+ err = ext4_orphan_add(handle, inode);
+ if (err)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
@@ -4160,11 +4263,13 @@ void ext4_truncate(struct inode *inode)
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ext4_ext_truncate(handle, inode);
+ err = ext4_ext_truncate(handle, inode);
else
ext4_ind_truncate(handle, inode);
up_write(&ei->i_data_sem);
+ if (err)
+ goto out_stop;
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -4180,11 +4285,12 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
trace_ext4_truncate_exit(inode);
+ return err;
}
/*
@@ -4352,7 +4458,9 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) &&
+ !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) &&
+ !ext4_encrypted_inode(inode))
new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
@@ -4411,7 +4519,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
{
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
+ EXT4_INODE_SIZE(inode->i_sb) &&
+ *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
ext4_find_inline_data_nolock(inode);
} else
@@ -4434,6 +4544,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct inode *inode;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
+ loff_t size;
int block;
uid_t i_uid;
gid_t i_gid;
@@ -4456,10 +4567,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
- EXT4_INODE_SIZE(inode->i_sb)) {
- EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
- EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
- EXT4_INODE_SIZE(inode->i_sb));
+ EXT4_INODE_SIZE(inode->i_sb) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode,
+ "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
ret = -EFSCORRUPTED;
goto bad_inode;
}
@@ -4534,6 +4647,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
+ if ((size = i_size_read(inode)) < 0) {
+ EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
@@ -4577,6 +4695,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
if (ei->i_extra_isize == 0) {
/* The extra space is currently unused. Use it. */
+ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
@@ -5154,7 +5273,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* update c/mtime in shrink case below
*/
if (!shrink) {
- inode->i_mtime = ext4_current_time(inode);
+ inode->i_mtime = current_time(inode);
inode->i_ctime = inode->i_mtime;
}
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5199,12 +5318,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
- if (shrink)
- ext4_truncate(inode);
+ if (shrink) {
+ rc = ext4_truncate(inode);
+ if (rc)
+ error = rc;
+ }
up_write(&EXT4_I(inode)->i_mmap_sem);
}
- if (!rc) {
+ if (!error) {
setattr_copy(inode, attr);
mark_inode_dirty(inode);
}
@@ -5216,7 +5338,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (orphan && inode->i_nlink)
ext4_orphan_del(NULL, inode);
- if (!rc && (ia_valid & ATTR_MODE))
+ if (!error && (ia_valid & ATTR_MODE))
rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
@@ -5455,18 +5577,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
return err;
- if (ext4_handle_valid(handle) &&
- EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+ if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
- * We need extra buffer credits since we may write into EA block
+ * In nojournal mode, we can immediately attempt to expand
+ * the inode. When journaled, we first need to obtain extra
+ * buffer credits since we may write into the EA block
* with this same handle. If journal_extend fails, then it will
* only result in a minor loss of functionality for that inode.
* If this is felt to be critical, then e2fsck should be run to
* force a large enough s_min_extra_isize.
*/
- if ((jbd2_journal_extend(handle,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+ if (!ext4_handle_valid(handle) ||
+ jbd2_journal_extend(handle,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) {
ret = ext4_expand_extra_isize(inode,
sbi->s_want_extra_isize,
iloc, handle);
@@ -5620,6 +5744,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
+ /*
+ * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated.
+ * E.g. S_DAX may get cleared / set.
+ */
+ ext4_set_inode_flags(inode);
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_journal_flag_rwsem);
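
One conversion in ext4_iomap_begin above deserves a note: iomap->blkno is expressed in 512-byte sectors, while map.m_pblk is in filesystem blocks, hence the shift by (blkbits - 9). A worked example, assuming a 4096-byte block size:

/* blkbits == 12 for a 4096-byte block, so one fs block spans
 * 2^(12-9) == 8 sectors.  Physical block 1000 therefore starts at
 * sector 8000:
 *
 *	sector_t blkno = (sector_t)1000 << (12 - 9);	/- == 8000 -/
 *
 * Likewise, iomap->length = (u64)map.m_len << blkbits converts a
 * block count into bytes.
 */
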
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae8ebbc97..49fd1371bfa2 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
swap_inode_data(inode, inode_bl);
- inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = inode_bl->i_ctime = current_time(inode);
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
@@ -191,6 +191,7 @@ journal_err_out:
return err;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
static int uuid_is_zero(__u8 u[16])
{
int i;
@@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16])
return 0;
return 1;
}
+#endif
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
@@ -248,8 +250,11 @@ static int ext4_ioctl_setflags(struct inode *inode,
err = -EOPNOTSUPP;
goto flags_out;
}
- } else if (oldflags & EXT4_EOFBLOCKS_FL)
- ext4_truncate(inode);
+ } else if (oldflags & EXT4_EOFBLOCKS_FL) {
+ err = ext4_truncate(inode);
+ if (err)
+ goto flags_out;
+ }
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
@@ -265,6 +270,9 @@ static int ext4_ioctl_setflags(struct inode *inode,
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
+ /* These flags get special treatment later */
+ if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL)
+ continue;
if (mask & flags)
ext4_set_inode_flag(inode, i);
else
@@ -272,7 +280,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
}
ext4_set_inode_flags(inode);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
@@ -368,7 +376,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
}
EXT4_I(inode)->i_projid = kprojid;
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -409,6 +417,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
return xflags;
}
+#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
+ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
+ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+
/* Transfer xflags flags to internal */
static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
{
@@ -453,12 +465,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
+ if (flags & ~EXT4_FL_USER_VISIBLE)
+ return -EOPNOTSUPP;
+ /*
+ * chattr(1) grabs flags via GETFLAGS, modifies the result and
+ * passes that to SETFLAGS. So we cannot easily make SETFLAGS
+ * more restrictive than just silently masking off visible but
+ * not settable flags as we always did.
+ */
+ flags &= EXT4_FL_USER_MODIFIABLE;
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ return -EOPNOTSUPP;
+
err = mnt_want_write_file(filp);
if (err)
return err;
- flags = ext4_mask_flags(inode->i_mode, flags);
-
inode_lock(inode);
err = ext4_ioctl_setflags(inode, flags);
inode_unlock(inode);
@@ -500,7 +522,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
@@ -765,28 +787,19 @@ resizefs_out:
}
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
- case EXT4_IOC_SET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct fscrypt_policy policy;
+ case EXT4_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- if (copy_from_user(&policy,
- (struct fscrypt_policy __user *)arg,
- sizeof(policy)))
- return -EFAULT;
- return fscrypt_process_policy(filp, &policy);
-#else
- return -EOPNOTSUPP;
-#endif
- }
case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
handle_t *handle;
- if (!ext4_sb_has_crypto(sb))
+ if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
err = mnt_want_write_file(filp);
@@ -816,24 +829,13 @@ resizefs_out:
sbi->s_es->s_encrypt_pw_salt, 16))
return -EFAULT;
return 0;
- }
- case EXT4_IOC_GET_ENCRYPTION_POLICY: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct fscrypt_policy policy;
- int err = 0;
-
- if (!ext4_encrypted_inode(inode))
- return -ENOENT;
- err = fscrypt_get_policy(inode, &policy);
- if (err)
- return err;
- if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
- return -EFAULT;
- return 0;
#else
return -EOPNOTSUPP;
#endif
}
+ case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+
case EXT4_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -865,13 +867,17 @@ resizefs_out:
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
+ return -EOPNOTSUPP;
+
+ flags = ext4_xflags_to_iflags(fa.fsx_xflags);
+ if (ext4_mask_flags(inode->i_mode, flags) != flags)
+ return -EOPNOTSUPP;
+
err = mnt_want_write_file(filp);
if (err)
return err;
- flags = ext4_xflags_to_iflags(fa.fsx_xflags);
- flags = ext4_mask_flags(inode->i_mode, flags);
-
inode_lock(inode);
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
(flags & EXT4_FL_XFLAG_VISIBLE);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f418f55c2bbe..7ae43c59bc79 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
ext4_grpblk_t min;
ext4_grpblk_t max;
ext4_grpblk_t chunk;
- unsigned short border;
+ unsigned int border;
BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
@@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct ext4_group_info *grinfo;
struct sg {
struct ext4_group_info info;
- ext4_grpblk_t counters[16];
+ ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
} sg;
group--;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d89754ef1aab..eb9835638680 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
}
/*
- * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * Write the MMP block using REQ_SYNC to try to get the block on-disk
* faster.
*/
static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
@@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh);
+ submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
@@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh);
+ submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
ret = -EIO;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 104f8bfba718..eadba919f26b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
dir->i_version++;
ext4_mark_inode_dirty(handle, dir);
@@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_dec_count(handle, dir);
ext4_update_dx_flag(dir);
@@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
- dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
+ dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
end_unlink:
@@ -3254,7 +3254,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
ext4_inc_count(handle, inode);
ihold(inode);
@@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
ent->de->file_type = file_type;
ent->dir->i_version++;
ent->dir->i_ctime = ent->dir->i_mtime =
- ext4_current_time(ent->dir);
+ current_time(ent->dir);
ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = ext4_current_time(old.inode);
+ old.inode->i_ctime = current_time(old.inode);
ext4_mark_inode_dirty(handle, old.inode);
if (!whiteout) {
@@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new.inode) {
ext4_dec_count(handle, new.inode);
- new.inode->i_ctime = ext4_current_time(new.inode);
+ new.inode->i_ctime = current_time(new.inode);
}
- old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+ old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
@@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
};
u8 new_file_type;
int retval;
+ struct timespec ctime;
if ((ext4_encrypted_inode(old_dir) ||
ext4_encrypted_inode(new_dir)) &&
@@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = ext4_current_time(old.inode);
- new.inode->i_ctime = ext4_current_time(new.inode);
+ ctime = current_time(old.inode);
+ old.inode->i_ctime = ctime;
+ new.inode->i_ctime = ctime;
ext4_mark_inode_dirty(handle, old.inode);
ext4_mark_inode_dirty(handle, new.inode);
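
Note the cross-rename hunk above samples the clock once into a local and assigns the same value to both inodes, so the two ctimes cannot straddle a clock tick. The pattern in miniature:

    #include <stdio.h>
    #include <time.h>

    struct rec { struct timespec ctime; };

    int main(void)
    {
        struct rec a, b;
        struct timespec now;

        /* sample the clock once so both records carry an identical stamp */
        clock_gettime(CLOCK_REALTIME, &now);
        a.ctime = now;
        b.ctime = now;

        printf("equal: %d\n", a.ctime.tv_sec == b.ctime.tv_sec &&
                              a.ctime.tv_nsec == b.ctime.tv_nsec);
        return 0;
    }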
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0094923e5ebf..d83b0f3c5fe9 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : 0;
+ REQ_SYNC : 0;
bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
@@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
}
if (buffer_new(bh)) {
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
set_buffer_async_write(bh);
nr_to_submit++;
@@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
gfp_t gfp_flags = GFP_NOFS;
retry_encrypt:
- data_page = fscrypt_encrypt_page(inode, page, gfp_flags);
+ data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
+ page->index, gfp_flags);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 52b0530c5d65..dfc8309d7755 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
- brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb)
}
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
+ brelse(sbi->s_sbh);
sb->s_fs_info = NULL;
/*
* Now that we are completely done shutting down the
@@ -1114,37 +1114,55 @@ static int ext4_prepare_context(struct inode *inode)
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
void *fs_data)
{
- handle_t *handle;
- int res, res2;
+ handle_t *handle = fs_data;
+ int res, res2, retries = 0;
+
+ /*
+ * If a journal handle was specified, then the encryption context is
+ * being set on a new inode via inheritance and is part of a larger
+ * transaction to create the inode. Otherwise the encryption context is
+ * being set on an existing inode in its own transaction. Only in the
+ * latter case should the "retry on ENOSPC" logic be used.
+ */
- /* fs_data is null when internally used. */
- if (fs_data) {
- res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
- len, 0);
+ if (handle) {
+ res = ext4_xattr_set_handle(handle, inode,
+ EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
ext4_clear_inode_state(inode,
EXT4_STATE_MAY_INLINE_DATA);
+ /*
+ * Update inode->i_flags - e.g. S_DAX may get disabled
+ */
+ ext4_set_inode_flags(inode);
}
return res;
}
+retry:
handle = ext4_journal_start(inode, EXT4_HT_MISC,
ext4_jbd2_credits_xattr(inode));
if (IS_ERR(handle))
return PTR_ERR(handle);
- res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
- EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx,
- len, 0);
+ res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
if (!res) {
ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ /* Update inode->i_flags - e.g. S_DAX may get disabled */
+ ext4_set_inode_flags(inode);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
}
res2 = ext4_journal_stop(handle);
+
+ if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (!res)
res = res2;
return res;
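
The restructured ext4_set_context() applies the retry-on-ENOSPC logic only when it owns the transaction, and it retries only after ext4_journal_stop() has closed the handle, since looping with a live handle would pin the journal. A hedged sketch of that start/try/stop/maybe-retry shape (begin_txn/end_txn/try_set_xattr and the retry bound are hypothetical stand-ins):

    #include <stdio.h>

    #define ENOSPC_ERR  (-28)
    #define MAX_RETRIES 3

    static int attempts;
    static int try_set_xattr(void)
    {
        /* simulate ENOSPC twice, then success */
        return ++attempts <= 2 ? ENOSPC_ERR : 0;
    }

    int main(void)
    {
        int retries = 0, res;
    retry:
        /* begin_txn(); */
        res = try_set_xattr();
        /* end_txn(); */
        if (res == ENOSPC_ERR && retries++ < MAX_RETRIES)
            goto retry;         /* retry only after the txn is fully closed */
        printf("res=%d after %d attempts\n", res, attempts);
        return 0;
    }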
@@ -1883,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
}
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
- test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit "
- "in data=ordered mode");
- return 0;
- }
return 1;
}
@@ -2330,7 +2342,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
struct ext4_super_block *es)
{
unsigned int s_flags = sb->s_flags;
- int nr_orphans = 0, nr_truncates = 0;
+ int ret, nr_orphans = 0, nr_truncates = 0;
#ifdef CONFIG_QUOTA
int i;
#endif
@@ -2412,7 +2424,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
inode->i_ino, inode->i_size);
inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
- ext4_truncate(inode);
+ ret = ext4_truncate(inode);
+ if (ret)
+ ext4_std_error(inode->i_sb, ret);
inode_unlock(inode);
nr_truncates++;
} else {
@@ -3193,10 +3207,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_set_bit(s++, buf);
count++;
}
- for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) {
- ext4_set_bit(EXT4_B2C(sbi, s++), buf);
- count++;
+ j = ext4_bg_num_gdb(sb, grp);
+ if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
+ ext4_error(sb, "Invalid number of block group "
+ "descriptor blocks: %d", j);
+ j = EXT4_BLOCKS_PER_GROUP(sb) - s;
}
+ count += j;
+ for (; j > 0; j--)
+ ext4_set_bit(EXT4_B2C(sbi, s++), buf);
}
if (!count)
return 0;
@@ -3301,7 +3320,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
struct ext4_super_block *es = NULL;
- struct ext4_sb_info *sbi;
+ struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
ext4_fsblk_t block;
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
@@ -3320,16 +3339,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
ext4_group_t first_not_zeroed;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- goto out_free_orig;
+ if ((data && !orig_data) || !sbi)
+ goto out_free_base;
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
- if (!sbi->s_blockgroup_lock) {
- kfree(sbi);
- goto out_free_orig;
- }
+ if (!sbi->s_blockgroup_lock)
+ goto out_free_base;
+
sb->s_fs_info = sbi;
sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
@@ -3475,11 +3492,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
- if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
- &journal_devnum, &journal_ioprio, 0)) {
- ext4_msg(sb, KERN_WARNING,
- "failed to parse options in superblock: %s",
- sbi->s_es->s_mount_opts);
+ if (sbi->s_es->s_mount_opts[0]) {
+ char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
+ sizeof(sbi->s_es->s_mount_opts),
+ GFP_KERNEL);
+ if (!s_mount_opts)
+ goto failed_mount;
+ if (!parse_options(s_mount_opts, sb, &journal_devnum,
+ &journal_ioprio, 0)) {
+ ext4_msg(sb, KERN_WARNING,
+ "failed to parse options in superblock: %s",
+ s_mount_opts);
+ }
+ kfree(s_mount_opts);
}
sbi->s_def_mount_opt = sbi->s_mount_opt;
if (!parse_options((char *) data, sb, &journal_devnum,
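
s_mount_opts is a fixed-width on-disk field with no guarantee of NUL termination, which is why the hunk above copies it out with kstrndup() bounded by sizeof() before handing it to parse_options() (which modifies the string in place), and why the mount message later in this patch prints it with a %.*s precision. The same bounded-copy idiom in plain C:

    #define _POSIX_C_SOURCE 200809L
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        /* fixed-width "on-disk" field, deliberately not NUL-terminated */
        char raw[8] = { 'n', 'o', 'a', 't', 'i', 'm', 'e', 'X' };

        /* bound the copy by the field size; strndup() appends the NUL */
        char *opts = strndup(raw, sizeof(raw));
        if (!opts)
            return 1;
        printf("safe copy: %s\n", opts);   /* prints "noatimeX" */
        free(opts);
        return 0;
    }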
@@ -3505,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and dax");
goto failed_mount;
}
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "encrypted files will use data=ordered "
+ "instead of data journaling mode");
+ }
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
} else {
@@ -3660,12 +3690,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
- if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
- goto cantfind_ext4;
sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0)
goto cantfind_ext4;
+ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+ sbi->s_inodes_per_group > blocksize * 8) {
+ ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
+ sbi->s_inodes_per_group);
+ goto failed_mount;
+ }
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
@@ -3748,13 +3782,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_cluster_ratio = clustersize / blocksize;
- if (sbi->s_inodes_per_group > blocksize * 8) {
- ext4_msg(sb, KERN_ERR,
- "#inodes per group too big: %lu",
- sbi->s_inodes_per_group);
- goto failed_mount;
- }
-
/* Do we have standard group size of clustersize * 8 blocks ? */
if (sbi->s_blocks_per_group == clustersize << 3)
set_opt2(sb, STD_GROUP_SIZE);
@@ -3814,6 +3841,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
+ if (ext4_has_feature_meta_bg(sb)) {
+ if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
+ ext4_msg(sb, KERN_WARNING,
+ "first meta block group too large: %u "
+ "(group descriptor block count %u)",
+ le32_to_cpu(es->s_first_meta_bg), db_count);
+ goto failed_mount;
+ }
+ }
sbi->s_group_desc = ext4_kvmalloc(db_count *
sizeof(struct buffer_head *),
GFP_KERNEL);
@@ -3967,6 +4003,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
default:
break;
}
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
+ test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ goto failed_mount_wq;
+ }
+
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
@@ -4160,7 +4204,9 @@ no_journal:
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
+ "Opts: %.*s%s%s", descr,
+ (int) sizeof(sbi->s_es->s_mount_opts),
+ sbi->s_es->s_mount_opts,
*sbi->s_es->s_mount_opts ? "; " : "", orig_data);
if (es->s_error_count)
@@ -4239,8 +4285,8 @@ failed_mount:
out_fail:
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
+out_free_base:
kfree(sbi);
-out_free_orig:
kfree(orig_data);
return err ? err : ret;
}
@@ -4550,7 +4596,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
&EXT4_SB(sb)->s_freeinodes_counter));
BUFFER_TRACE(sbh, "marking dirty");
ext4_superblock_csum_set(sb);
- lock_buffer(sbh);
+ if (sync)
+ lock_buffer(sbh);
if (buffer_write_io_error(sbh)) {
/*
* Oh, dear. A previous attempt to write the
@@ -4566,10 +4613,10 @@ static int ext4_commit_super(struct super_block *sb, int sync)
set_buffer_uptodate(sbh);
}
mark_buffer_dirty(sbh);
- unlock_buffer(sbh);
if (sync) {
+ unlock_buffer(sbh);
error = __sync_dirty_buffer(sbh,
- test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC);
+ test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC);
if (error)
return error;
@@ -4857,6 +4904,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
+ } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit in data=ordered mode");
+ err = -EINVAL;
+ goto restore_opts;
+ }
}
if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
@@ -5366,7 +5420,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
if (IS_ERR(handle))
goto out;
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index d77be9e9f535..5a94fa52b74f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
{
struct ext4_xattr_entry *e = entry;
+ /* Find the end of the names list */
while (!IS_LAST_ENTRY(e)) {
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
if ((void *)next >= end)
@@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
e = next;
}
+ /* Check the values */
while (!IS_LAST_ENTRY(entry)) {
if (entry->e_value_block != 0)
return -EFSCORRUPTED;
- if (entry->e_value_size != 0 &&
- (value_start + le16_to_cpu(entry->e_value_offs) <
- (void *)e + sizeof(__u32) ||
- value_start + le16_to_cpu(entry->e_value_offs) +
- le32_to_cpu(entry->e_value_size) > end))
- return -EFSCORRUPTED;
+ if (entry->e_value_size != 0) {
+ u16 offs = le16_to_cpu(entry->e_value_offs);
+ u32 size = le32_to_cpu(entry->e_value_size);
+ void *value;
+
+ /*
+ * The value cannot overlap the names, and the value
+ * with padding cannot extend beyond 'end'. Check both
+ * the padded and unpadded sizes, since the size may
+ * overflow to 0 when adding padding.
+ */
+ if (offs > end - value_start)
+ return -EFSCORRUPTED;
+ value = value_start + offs;
+ if (value < (void *)e + sizeof(u32) ||
+ size > end - value ||
+ EXT4_XATTR_SIZE(size) > end - value)
+ return -EFSCORRUPTED;
+ }
entry = EXT4_XATTR_NEXT(entry);
}
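
The rewritten ext4_xattr_check_names() never forms a pointer before proving the offset is in range: it compares offs against end - value_start first, then checks the unpadded and padded sizes separately because the EXT4_XATTR_SIZE() rounding can wrap a huge size around to a small one. A self-contained sketch of the overflow-safe bounds check (PAD4 is a hypothetical stand-in for the 4-byte rounding):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* hypothetical 4-byte rounding, like EXT4_XATTR_SIZE() */
    #define PAD4(x) (((x) + 3u) & ~3u)

    /* validate an (offs, size) value region inside [start, end) without
     * ever computing an out-of-range pointer */
    static int value_ok(const uint8_t *start, const uint8_t *end,
                        uint16_t offs, uint32_t size)
    {
        const uint8_t *value;

        if (offs > (size_t)(end - start))       /* offset itself in range? */
            return 0;
        value = start + offs;                   /* only now safe to form */
        if (size > (size_t)(end - value))       /* raw size must fit */
            return 0;
        if (PAD4(size) > (size_t)(end - value)) /* padded size must fit too */
            return 0;
        return 1;
    }

    int main(void)
    {
        uint8_t buf[64];

        printf("%d\n", value_ok(buf, buf + 64, 60, 4)); /* 1: fits exactly */
        printf("%d\n", value_ok(buf, buf + 64, 62, 2)); /* 0: padding overruns */
        printf("%d\n", value_ok(buf, buf + 64, 62, 0xfffffffdu));
                       /* 0: raw check catches it even though PAD4() wraps to 0 */
        return 0;
    }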
@@ -231,13 +246,12 @@ static int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
- struct ext4_xattr_entry *entry = IFIRST(header);
int error = -EFSCORRUPTED;
- if (((void *) header >= end) ||
+ if (end - (void *)header < sizeof(*header) + sizeof(u32) ||
(header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)))
goto errout;
- error = ext4_xattr_check_names(entry, end, entry);
+ error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
errout:
if (error)
__ext4_error_inode(inode, function, line, 0,
@@ -1109,7 +1123,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
return 0;
}
-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+static int ext4_xattr_ibody_set(struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is)
{
@@ -1216,7 +1230,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
}
if (!value) {
if (!is.s.not_found)
- error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(inode, &i, &is);
else if (!bs.s.not_found)
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else {
@@ -1227,7 +1241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
goto cleanup;
- error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(inode, &i, &is);
if (!error && !bs.s.not_found) {
i.value = NULL;
error = ext4_xattr_block_set(handle, inode, &i, &bs);
@@ -1242,14 +1256,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
goto cleanup;
if (!is.s.not_found) {
i.value = NULL;
- error = ext4_xattr_ibody_set(handle, inode, &i,
- &is);
+ error = ext4_xattr_ibody_set(inode, &i, &is);
}
}
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = ext4_current_time(inode);
+ inode->i_ctime = current_time(inode);
if (!value)
ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
@@ -1384,7 +1397,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
goto out;
/* Remove the chosen entry from the inode */
- error = ext4_xattr_ibody_set(handle, inode, &i, is);
+ error = ext4_xattr_ibody_set(inode, &i, is);
if (error)
goto out;
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 6fe23af509e1..8f487692c21f 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
if (error)
return error;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7e9b504bd8b2..f73ee9534d83 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
.sbi = sbi,
.type = META,
.op = REQ_OP_READ,
- .op_flags = READ_SYNC | REQ_META | REQ_PRIO,
+ .op_flags = REQ_META | REQ_PRIO,
.old_blkaddr = index,
.new_blkaddr = index,
.encrypted_page = NULL,
@@ -160,7 +160,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.sbi = sbi,
.type = META,
.op = REQ_OP_READ,
- .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD,
+ .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD,
.encrypted_page = NULL,
};
struct blk_plug plug;
@@ -228,7 +228,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
f2fs_put_page(page, 0);
if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
+ ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
}
static int f2fs_write_meta_page(struct page *page,
@@ -770,7 +770,12 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
/* Sanity checking of checkpoint */
if (sanity_check_ckpt(sbi))
- goto fail_no_cp;
+ goto free_fail_no_cp;
+
+ if (cur_page == cp1)
+ sbi->cur_cp_pack = 1;
+ else
+ sbi->cur_cp_pack = 2;
if (cp_blks <= 1)
goto done;
@@ -793,6 +798,9 @@ done:
f2fs_put_page(cp2, 1);
return 0;
+free_fail_no_cp:
+ f2fs_put_page(cp1, 1);
+ f2fs_put_page(cp2, 1);
fail_no_cp:
kfree(sbi->ckpt);
return -EINVAL;
@@ -921,7 +929,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
inode = igrab(&fi->vfs_inode);
spin_unlock(&sbi->inode_lock[DIRTY_META]);
if (inode) {
- update_inode_page(inode);
+ sync_inode_metadata(inode, 0);
+
+ /* it's on eviction */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE))
+ update_inode_page(inode);
iput(inode);
}
};
@@ -987,7 +999,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
{
up_write(&sbi->node_write);
- build_free_nids(sbi);
+ build_free_nids(sbi, false);
f2fs_unlock_all(sbi);
}
@@ -998,7 +1010,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&sbi->nr_wb_bios))
+ if (!get_pages(sbi, F2FS_WB_CP_DATA))
break;
io_schedule_timeout(5*HZ);
@@ -1123,7 +1135,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
- start_blk = __start_cp_addr(sbi);
+ start_blk = __start_cp_next_addr(sbi);
/* need to wait for end_io results */
wait_on_all_pages_writeback(sbi);
@@ -1184,9 +1196,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
- clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
clear_sbi_flag(sbi, SBI_NEED_CP);
+ __set_cp_next_pack(sbi);
/*
* redirty superblock if metadata like node page or inode cache is
@@ -1261,8 +1273,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* unlock all the fs_lock[] in do_checkpoint() */
err = do_checkpoint(sbi, cpc);
-
- f2fs_wait_all_discard_bio(sbi);
+ if (err) {
+ release_discard_addrs(sbi);
+ } else {
+ clear_prefree_segments(sbi, cpc);
+ f2fs_wait_all_discard_bio(sbi);
+ }
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
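
write_checkpoint() now treats do_checkpoint() as the commit point: a failure releases the reserved discard addresses, and only success clears the prefree segments and waits for the discard bios, so freed space is never reused before the checkpoint that frees it is durable. The ordering, reduced to a sketch (all names hypothetical):

    #include <stdio.h>

    static int do_commit(int should_fail) { return should_fail ? -5 : 0; }

    static int checkpoint(int should_fail)
    {
        /* ... reservation phase: collect candidate free segments ... */
        int err = do_commit(should_fail);
        if (err) {
            puts("rollback: release reserved discard addresses");
            return err;
        }
        /* only now is it safe to reuse the space this checkpoint freed */
        puts("apply: clear prefree segments, wait for discard I/O");
        return 0;
    }

    int main(void)
    {
        checkpoint(1);
        checkpoint(0);
        return 0;
    }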
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9ae194fd2fdb..9ac262564fa6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -29,6 +29,26 @@
#include "trace.h"
#include <trace/events/f2fs.h>
+static bool __is_cp_guaranteed(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct f2fs_sb_info *sbi;
+
+ if (!mapping)
+ return false;
+
+ inode = mapping->host;
+ sbi = F2FS_I_SB(inode);
+
+ if (inode->i_ino == F2FS_META_INO(sbi) ||
+ inode->i_ino == F2FS_NODE_INO(sbi) ||
+ S_ISDIR(inode->i_mode) ||
+ is_cold_data(page))
+ return true;
+ return false;
+}
+
static void f2fs_read_end_io(struct bio *bio)
{
struct bio_vec *bvec;
@@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+ enum count_type type = WB_DATA_TYPE(page);
fscrypt_pullback_bio_page(&page, true);
@@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio)
mapping_set_error(page->mapping, -EIO);
f2fs_stop_checkpoint(sbi, true);
}
+ dec_page_count(sbi, type);
+ clear_cold_data(page);
end_page_writeback(page);
}
- if (atomic_dec_and_test(&sbi->nr_wb_bios) &&
+ if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
@@ -88,6 +111,46 @@ static void f2fs_write_end_io(struct bio *bio)
}
/*
+ * Map blk_addr to its target device; if a bio is passed in, also
+ * point it at that device with the block address rebased.
+ */
+struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ if (FDEV(i).start_blk <= blk_addr &&
+ FDEV(i).end_blk >= blk_addr) {
+ blk_addr -= FDEV(i).start_blk;
+ bdev = FDEV(i).bdev;
+ break;
+ }
+ }
+ if (bio) {
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+ }
+ return bdev;
+}
+
+int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
+ return i;
+ return 0;
+}
+
+static bool __same_bdev(struct f2fs_sb_info *sbi,
+ block_t blk_addr, struct bio *bio)
+{
+ return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev;
+}
+
+/*
* Low-level block read/write IO operations.
*/
static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
@@ -97,8 +160,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
bio = f2fs_bio_alloc(npages);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
+ f2fs_target_device(sbi, blk_addr, bio);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
bio->bi_private = is_read ? NULL : sbi;
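
With multi-device support, a global block address must be resolved to its backing device by scanning the per-device [start_blk, end_blk] ranges and rebasing the address, which is what f2fs_target_device() does before pointing a bio at the right bdev. A compact userspace model of the range lookup:

    #include <stdio.h>

    struct dev_range { int id; unsigned start, end; }; /* inclusive range */

    static struct dev_range devs[] = {
        { 0, 0,    999  },
        { 1, 1000, 4999 },
        { 2, 5000, 8191 },
    };

    /* map a global block to (device id, device-local block) */
    static int target_device(unsigned blk, unsigned *local)
    {
        for (unsigned i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
            if (blk >= devs[i].start && blk <= devs[i].end) {
                *local = blk - devs[i].start;
                return devs[i].id;
            }
        }
        return -1;  /* out of range */
    }

    int main(void)
    {
        unsigned local;
        int id = target_device(6200, &local);
        printf("blk 6200 -> dev %d, local blk %u\n", id, local);
        return 0;   /* dev 2, local 1200 */
    }

Because a bio cannot span devices, the submission paths below also gain __same_bdev() checks before merging another page into an existing bio.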
@@ -109,8 +171,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
struct bio *bio, enum page_type type)
{
if (!is_read_io(bio_op(bio))) {
- atomic_inc(&sbi->nr_wb_bios);
- if (f2fs_sb_mounted_hmsmr(sbi->sb) &&
+ if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
current->plug && (type == DATA || type == NODE))
blk_finish_plug(current->plug);
}
@@ -198,11 +259,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
io->fio.op = REQ_OP_WRITE;
- if (test_opt(sbi, NOBARRIER))
- io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO;
- else
- io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META |
- REQ_PRIO;
+ io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO;
+ if (!test_opt(sbi, NOBARRIER))
+ io->fio.op_flags |= REQ_FUA;
}
__submit_merged_bio(io);
out:
@@ -270,22 +329,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
verify_block_addr(sbi, fio->old_blkaddr);
verify_block_addr(sbi, fio->new_blkaddr);
+ bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+
+ if (!is_read)
+ inc_page_count(sbi, WB_DATA_TYPE(bio_page));
+
down_write(&io->io_rwsem);
if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
- (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags)))
+ (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) ||
+ !__same_bdev(sbi, fio->new_blkaddr, io->bio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
-
io->bio = __bio_alloc(sbi, fio->new_blkaddr,
- bio_blocks, is_read);
+ BIO_MAX_PAGES, is_read);
io->fio = *fio;
}
- bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
-
if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
PAGE_SIZE) {
__submit_merged_bio(io);
@@ -483,7 +544,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = get_read_data_page(inode, index, READ_SYNC, false);
+ page = get_read_data_page(inode, index, 0, false);
if (IS_ERR(page))
return page;
@@ -509,7 +570,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = get_read_data_page(inode, index, READ_SYNC, for_write);
+ page = get_read_data_page(inode, index, 0, for_write);
if (IS_ERR(page))
return page;
@@ -590,7 +651,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_summary sum;
struct node_info ni;
- int seg = CURSEG_WARM_DATA;
pgoff_t fofs;
blkcnt_t count = 1;
@@ -608,11 +668,8 @@ alloc:
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
- seg = CURSEG_DIRECT_IO;
-
allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
- &sum, seg);
+ &sum, CURSEG_WARM_DATA);
set_data_blkaddr(dn);
/* update i_size */
@@ -624,11 +681,18 @@ alloc:
return 0;
}
-ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
+static inline bool __force_buffered_io(struct inode *inode, int rw)
+{
+ return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) ||
+ (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
+ F2FS_I_SB(inode)->s_ndevs);
+}
+
+int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct f2fs_map_blocks map;
- ssize_t ret = 0;
+ int err = 0;
map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
@@ -640,19 +704,22 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
map.m_next_pgofs = NULL;
if (iocb->ki_flags & IOCB_DIRECT) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ return f2fs_map_blocks(inode, &map, 1,
+ __force_buffered_io(inode, WRITE) ?
+ F2FS_GET_BLOCK_PRE_AIO :
+ F2FS_GET_BLOCK_PRE_DIO);
}
if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
}
if (!f2fs_has_inline_data(inode))
return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
- return ret;
+ return err;
}
/*
@@ -676,7 +743,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
unsigned int ofs_in_node, last_ofs_in_node;
blkcnt_t prealloc;
struct extent_info ei;
- bool allocated = false;
block_t blkaddr;
if (!maxblocks)
@@ -716,7 +782,7 @@ next_dnode:
}
prealloc = 0;
- ofs_in_node = dn.ofs_in_node;
+ last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
@@ -735,10 +801,8 @@ next_block:
}
} else {
err = __allocate_data_block(&dn);
- if (!err) {
+ if (!err)
set_inode_flag(inode, FI_APPEND_WRITE);
- allocated = true;
- }
}
if (err)
goto sync_out;
@@ -793,7 +857,6 @@ skip:
err = reserve_new_blocks(&dn, prealloc);
if (err)
goto sync_out;
- allocated = dn.node_changed;
map->m_len += dn.ofs_in_node - ofs_in_node;
if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
@@ -812,9 +875,8 @@ skip:
if (create) {
f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ f2fs_balance_fs(sbi, dn.node_changed);
}
- allocated = false;
goto next_dnode;
sync_out:
@@ -822,7 +884,7 @@ sync_out:
unlock_out:
if (create) {
f2fs_unlock_op(sbi);
- f2fs_balance_fs(sbi, allocated);
+ f2fs_balance_fs(sbi, dn.node_changed);
}
out:
trace_f2fs_map_blocks(inode, map, err);
@@ -834,19 +896,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
pgoff_t *next_pgofs)
{
struct f2fs_map_blocks map;
- int ret;
+ int err;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
map.m_next_pgofs = next_pgofs;
- ret = f2fs_map_blocks(inode, &map, create, flag);
- if (!ret) {
+ err = f2fs_map_blocks(inode, &map, create, flag);
+ if (!err) {
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
bh->b_size = map.m_len << inode->i_blkbits;
}
- return ret;
+ return err;
}
static int get_data_block(struct inode *inode, sector_t iblock,
@@ -891,7 +953,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct buffer_head map_bh;
sector_t start_blk, last_blk;
pgoff_t next_pgofs;
- loff_t isize;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
int ret = 0;
@@ -908,13 +969,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
inode_lock(inode);
- isize = i_size_read(inode);
- if (start >= isize)
- goto out;
-
- if (start + len > isize)
- len = isize - start;
-
if (logical_to_blk(inode, len) == 0)
len = blk_to_logical(inode, 1);
@@ -933,13 +987,11 @@ next:
/* HOLE */
if (!buffer_mapped(&map_bh)) {
start_blk = next_pgofs;
- /* Go through holes util pass the EOF */
- if (blk_to_logical(inode, start_blk) < isize)
+
+ if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
+ F2FS_I_SB(inode)->max_file_blocks))
goto prep_next;
- /* Found a hole beyond isize means no more extents.
- * Note that the premise is that filesystems don't
- * punch holes beyond isize and keep size unchanged.
- */
+
flags |= FIEMAP_EXTENT_LAST;
}
@@ -982,7 +1034,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct fscrypt_ctx *ctx = NULL;
- struct block_device *bdev = sbi->sb->s_bdev;
struct bio *bio;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
@@ -1000,8 +1051,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
fscrypt_release_ctx(ctx);
return ERR_PTR(-ENOMEM);
}
- bio->bi_bdev = bdev;
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr);
+ f2fs_target_device(sbi, blkaddr, bio);
bio->bi_end_io = f2fs_read_end_io;
bio->bi_private = ctx;
@@ -1096,7 +1146,8 @@ got_it:
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != block_nr - 1)) {
+ if (bio && (last_block_in_bio != block_nr - 1 ||
+ !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) {
submit_and_realloc:
__submit_bio(F2FS_I_SB(inode), bio, DATA);
bio = NULL;
@@ -1195,7 +1246,9 @@ int do_write_data_page(struct f2fs_io_info *fio)
fio->old_blkaddr);
retry_encrypt:
fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
- gfp_flags);
+ PAGE_SIZE, 0,
+ fio->page->index,
+ gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
if (err == -ENOMEM) {
@@ -1251,7 +1304,7 @@ static int f2fs_write_data_page(struct page *page,
.sbi = sbi,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+ .op_flags = wbc_to_write_flags(wbc),
.page = page,
.encrypted_page = NULL,
};
@@ -1311,7 +1364,6 @@ done:
if (err && err != -ENOENT)
goto redirty_out;
- clear_cold_data(page);
out:
inode_dec_dirty_pages(inode);
if (err)
@@ -1332,6 +1384,8 @@ out:
redirty_out:
redirty_page_for_writepage(wbc, page);
+ if (!err)
+ return AOP_WRITEPAGE_ACTIVATE;
unlock_page(page);
return err;
}
@@ -1427,6 +1481,15 @@ continue_unlock:
ret = mapping->a_ops->writepage(page, wbc);
if (unlikely(ret)) {
+ /*
+ * keep nr_to_write, since vfs uses this to
+ * get # of written pages.
+ */
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ continue;
+ }
done_index = page->index + 1;
done = 1;
break;
@@ -1663,7 +1726,7 @@ repeat:
err = PTR_ERR(bio);
goto fail;
}
- bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+ bio->bi_opf = REQ_OP_READ;
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
err = -EFAULT;
@@ -1714,7 +1777,6 @@ static int f2fs_write_end(struct file *file,
goto unlock_out;
set_page_dirty(page);
- clear_cold_data(page);
if (pos + copied > i_size_read(inode))
f2fs_i_size_write(inode, pos + copied);
@@ -1751,9 +1813,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (err)
return err;
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- return 0;
- if (test_opt(F2FS_I_SB(inode), LFS))
+ if (__force_buffered_io(inode, rw))
return 0;
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
@@ -1785,12 +1845,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
return;
if (PageDirty(page)) {
- if (inode->i_ino == F2FS_META_INO(sbi))
+ if (inode->i_ino == F2FS_META_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_META);
- else if (inode->i_ino == F2FS_NODE_INO(sbi))
+ } else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_NODES);
- else
+ } else {
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
}
/* This is atomic written page, keep Private */
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fb245bd302e4..fbd5184140d0 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
- si->wb_bios = atomic_read(&sbi->nr_wb_bios);
+ si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA);
+ si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -74,7 +75,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
si->sits = MAIN_SEGS(sbi);
si->dirty_sits = SIT_I(sbi)->dirty_sentries;
- si->fnids = NM_I(sbi)->fcnt;
+ si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+ si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
si->bg_gc = sbi->bg_gc;
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
@@ -194,7 +196,9 @@ get_cache:
si->cache_mem += sizeof(struct flush_cmd_control);
/* free nids */
- si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
+ NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) *
+ sizeof(struct free_nid);
si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
@@ -310,22 +314,22 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n",
- si->inmem_pages, si->wb_bios);
- seq_printf(s, " - nodes: %4lld in %4d\n",
+ seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n",
+ si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data);
+ seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
- seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n",
+ seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n",
si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
- seq_printf(s, " - datas: %4lld in files:%4d\n",
+ seq_printf(s, " - datas: %4d in files:%4d\n",
si->ndirty_data, si->ndirty_files);
- seq_printf(s, " - meta: %4lld in %4d\n",
+ seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
- seq_printf(s, " - imeta: %4lld\n",
+ seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
- seq_printf(s, " - free_nids: %9d\n",
- si->fnids);
+ seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n",
+ si->free_nids, si->alloc_nids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
@@ -373,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file)
}
static const struct file_operations stat_fops = {
+ .owner = THIS_MODULE,
.open = stat_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 369f4513be37..827c5daef4fc 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
/* show encrypted name */
if (fname->hash) {
- if (de->hash_code == fname->hash)
+ if (de->hash_code == cpu_to_le32(fname->hash))
goto found;
} else if (de_name.len == name->len &&
de->hash_code == namehash &&
@@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
set_page_dirty(page);
dir->i_mtime = dir->i_ctime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
clear_inode_flag(inode, FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
f2fs_i_depth_write(dir, current_depth);
@@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
f2fs_drop_nlink(dir, inode);
@@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
ClearPagePrivate(page);
ClearPageUptodate(page);
inode_dec_dirty_pages(dir);
+ remove_dirty_inode(dir);
}
f2fs_put_page(page, 1);
}
@@ -784,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir)
return true;
}
-bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
+int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
unsigned int start_pos, struct fscrypt_str *fstr)
{
unsigned char d_type = DT_UNKNOWN;
@@ -819,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
(u32)de->hash_code, 0,
&de_name, fstr);
if (err)
- return true;
+ return err;
de_name = *fstr;
fstr->len = save_len;
@@ -827,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
if (!dir_emit(ctx, de_name.name, de_name.len,
le32_to_cpu(de->ino), d_type))
- return true;
+ return 1;
bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
ctx->pos = start_pos + bit_pos;
}
- return false;
+ return 0;
}
static int f2fs_readdir(struct file *file, struct dir_context *ctx)
@@ -871,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
dentry_page = get_lock_data_page(inode, n, false);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ err = 0;
continue;
- else
+ } else {
goto out;
+ }
}
dentry_blk = kmap(dentry_page);
make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
- if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) {
+ err = f2fs_fill_dentries(ctx, &d,
+ n * NR_DENTRY_IN_BLOCK, &fstr);
+ if (err) {
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
break;
@@ -891,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
- err = 0;
out:
fscrypt_fname_free_buffer(&fstr);
- return err;
+ return err < 0 ? err : 0;
}
static int f2fs_dir_open(struct inode *inode, struct file *filp)
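
f2fs_fill_dentries() previously returned bool, which conflated "dir_emit() asked us to stop" with a real failure; it now returns a negative errno for errors, 1 for an early stop, and 0 to keep scanning, and f2fs_readdir() folds the positive case back into success. The convention in isolation:

    #include <stdio.h>

    /* tri-state iterator: <0 error, 1 consumer said stop, 0 exhausted */
    static int fill(int fail_at, int stop_at)
    {
        for (int i = 0; i < 10; i++) {
            if (i == fail_at)
                return -22;     /* e.g. -EINVAL from name decoding */
            if (i == stop_at)
                return 1;       /* buffer full: stop, not an error */
        }
        return 0;
    }

    int main(void)
    {
        int err = fill(-1, 4);
        /* callers report only real errors, as f2fs_readdir() does */
        printf("readdir result: %d\n", err < 0 ? err : 0);  /* 0 */
        return 0;
    }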
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 2b06d4fcd954..4db44da7ef69 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode,
if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) {
largest->len = 0;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9e8de18a168a..2da8c3aa0ce5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -103,7 +103,7 @@ struct f2fs_mount_info {
};
#define F2FS_FEATURE_ENCRYPT 0x0001
-#define F2FS_FEATURE_HMSMR 0x0002
+#define F2FS_FEATURE_BLKZONED 0x0002
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -401,6 +401,7 @@ struct f2fs_map_blocks {
#define FADVISE_LOST_PINO_BIT 0x02
#define FADVISE_ENCRYPT_BIT 0x04
#define FADVISE_ENC_NAME_BIT 0x08
+#define FADVISE_KEEP_SIZE_BIT 0x10
#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
@@ -413,6 +414,8 @@ struct f2fs_map_blocks {
#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT)
+#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT)
#define DEF_DIR_LEVEL 0
@@ -428,7 +431,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
- struct percpu_counter dirty_pages; /* # of dirty pages */
+ atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
@@ -493,20 +496,26 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
-extern void f2fs_mark_inode_dirty_sync(struct inode *);
+extern void f2fs_mark_inode_dirty_sync(struct inode *, bool);
static inline void __try_update_largest_extent(struct inode *inode,
struct extent_tree *et, struct extent_node *en)
{
if (en->ei.len > et->largest.len) {
et->largest = en->ei;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
}
+enum nid_list {
+ FREE_NID_LIST,
+ ALLOC_NID_LIST,
+ MAX_NID_LIST,
+};
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
- nid_t available_nids; /* maximum available node ids */
+ nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
@@ -522,9 +531,9 @@ struct f2fs_nm_info {
/* free node ids management */
struct radix_tree_root free_nid_root;/* root of the free_nid cache */
- struct list_head free_nid_list; /* a list for free nids */
- spinlock_t free_nid_list_lock; /* protect free nid list */
- unsigned int fcnt; /* the number of free node id */
+ struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */
+ unsigned int nid_cnt[MAX_NID_LIST]; /* the number of nids on each list */
+ spinlock_t nid_list_lock; /* protect nid lists ops */
struct mutex build_lock; /* lock for build free nids */
/* for checkpoint */
@@ -585,7 +594,6 @@ enum {
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
NO_CHECK_TYPE,
- CURSEG_DIRECT_IO, /* to use for the direct IO path */
};
struct flush_cmd {
@@ -649,6 +657,7 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
+#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
@@ -656,6 +665,8 @@ enum count_type {
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
F2FS_DIRTY_IMETA,
+ F2FS_WB_CP_DATA,
+ F2FS_WB_DATA,
NR_COUNT_TYPE,
};
@@ -688,7 +699,7 @@ struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
int op; /* contains REQ_OP_ */
- int op_flags; /* rq_flag_bits */
+ int op_flags; /* req_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
@@ -704,6 +715,20 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+#define FDEV(i) (sbi->devs[i])
+#define RDEV(i) (raw_super->devs[i])
+struct f2fs_dev_info {
+ struct block_device *bdev;
+ char path[MAX_PATH_LEN];
+ unsigned int total_segments;
+ block_t start_blk;
+ block_t end_blk;
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int nr_blkz; /* Total number of zones */
+ u8 *blkz_type; /* Array of zones type */
+#endif
+};
+
enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
@@ -750,6 +775,12 @@ struct f2fs_sb_info {
u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
u8 key_prefix_size;
#endif
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */
+#endif
+
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
struct inode *node_inode; /* cache node blocks */
@@ -764,6 +795,7 @@ struct f2fs_sb_info {
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
+ int cur_cp_pack; /* currently valid cp pack (1 or 2) */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
@@ -815,10 +847,9 @@ struct f2fs_sb_info {
block_t discard_blks; /* discard command candidates */
block_t last_valid_block_count; /* for recovery */
u32 s_next_generation; /* for NFS support */
- atomic_t nr_wb_bios; /* # of writeback bios */
/* # of pages, see count_type */
- struct percpu_counter nr_pages[NR_COUNT_TYPE];
+ atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
@@ -863,6 +894,8 @@ struct f2fs_sb_info {
/* For shrinker support */
struct list_head s_list;
+ int s_ndevs; /* number of devices */
+ struct f2fs_dev_info *devs; /* for device list */
struct mutex umount_mutex;
unsigned int shrinker_run_no;
@@ -1105,13 +1138,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock(&sbi->cp_lock);
}
-static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
-{
- struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
-
- return blk_queue_discard(q);
-}
-
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
down_read(&sbi->cp_rwsem);
@@ -1232,9 +1258,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- percpu_counter_inc(&sbi->nr_pages[count_type]);
+ atomic_inc(&sbi->nr_pages[count_type]);
- if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES)
+ if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES ||
+ count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA)
return;
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -1242,14 +1269,14 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_inc_dirty_pages(struct inode *inode)
{
- percpu_counter_inc(&F2FS_I(inode)->dirty_pages);
+ atomic_inc(&F2FS_I(inode)->dirty_pages);
inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
{
- percpu_counter_dec(&sbi->nr_pages[count_type]);
+ atomic_dec(&sbi->nr_pages[count_type]);
}
static inline void inode_dec_dirty_pages(struct inode *inode)
@@ -1258,19 +1285,19 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
!S_ISLNK(inode->i_mode))
return;
- percpu_counter_dec(&F2FS_I(inode)->dirty_pages);
+ atomic_dec(&F2FS_I(inode)->dirty_pages);
dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
- return percpu_counter_sum_positive(&sbi->nr_pages[count_type]);
+ return atomic_read(&sbi->nr_pages[count_type]);
}
-static inline s64 get_dirty_pages(struct inode *inode)
+static inline int get_dirty_pages(struct inode *inode)
{
- return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages);
+ return atomic_read(&F2FS_I(inode)->dirty_pages);
}
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
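
The counters move from percpu_counter to atomic_t: percpu_counter_sum_positive() must walk every CPU's delta on each read, whereas atomic_read() is a single load, which suits counters like nr_pages[] that are polled in hot paths (the checkpoint writeback wait earlier in this patch, for one). A C11 sketch of the resulting access pattern:

    #include <stdatomic.h>
    #include <stdio.h>

    /* one shared counter per type: writes are a contended RMW, but a read
     * is a single load, the trade made here for the nr_pages[] statistics */
    static atomic_int nr_pages[2];          /* e.g. [WB_DATA, WB_CP_DATA] */

    int main(void)
    {
        atomic_fetch_add(&nr_pages[1], 1);  /* inc_page_count() analogue */
        atomic_fetch_sub(&nr_pages[1], 1);  /* dec_page_count() analogue */

        /* get_pages() analogue: exact, no per-CPU summation needed */
        printf("WB_CP_DATA in flight: %d\n", atomic_load(&nr_pages[1]));
        return 0;
    }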
@@ -1329,22 +1356,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
{
- block_t start_addr;
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- unsigned long long ckpt_version = cur_cp_version(ckpt);
-
- start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+ block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
- /*
- * odd numbered checkpoint should at cp segment 0
- * and even segment must be at cp segment 1
- */
- if (!(ckpt_version & 1))
+ if (sbi->cur_cp_pack == 2)
start_addr += sbi->blocks_per_seg;
+ return start_addr;
+}
+
+static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
+{
+ block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+ if (sbi->cur_cp_pack == 1)
+ start_addr += sbi->blocks_per_seg;
return start_addr;
}
+static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi)
+{
+ sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1;
+}
+
static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
{
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
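
Rather than re-deriving the active checkpoint pack from the version number's parity at every use, the mount path records cur_cp_pack once; __start_cp_addr() reads the known-good slot, __start_cp_next_addr() targets the other, and __set_cp_next_pack() flips only after a checkpoint commits, a standard A/B slot scheme, modeled below:

    #include <stdio.h>

    /* two fixed checkpoint slots; cur_pack is the last known-good one */
    static unsigned cp_base = 1000, seg_blocks = 512;
    static int cur_pack = 1;

    static unsigned start_cp_addr(void)      /* read the valid pack */
    {
        return cp_base + (cur_pack == 2 ? seg_blocks : 0);
    }

    static unsigned start_cp_next_addr(void) /* write to the other pack */
    {
        return cp_base + (cur_pack == 1 ? seg_blocks : 0);
    }

    static void commit_ok(void)              /* flip only after success */
    {
        cur_pack = (cur_pack == 1) ? 2 : 1;
    }

    int main(void)
    {
        printf("read %u, write %u\n", start_cp_addr(), start_cp_next_addr());
        commit_ok();
        printf("read %u, write %u\n", start_cp_addr(), start_cp_next_addr());
        return 0;
    }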
@@ -1621,7 +1653,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
return;
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
}
@@ -1648,7 +1680,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode)
{
F2FS_I(inode)->i_acl_mode = mode;
set_inode_flag(inode, FI_ACL_MODE);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
static inline void f2fs_i_links_write(struct inode *inode, bool inc)
@@ -1657,7 +1689,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc)
inc_nlink(inode);
else
drop_nlink(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_blocks_write(struct inode *inode,
@@ -1668,7 +1700,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode,
inode->i_blocks = add ? inode->i_blocks + diff :
inode->i_blocks - diff;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
}
@@ -1682,34 +1714,27 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
return;
i_size_write(inode, i_size);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
}
-static inline bool f2fs_skip_inode_update(struct inode *inode)
-{
- if (!is_inode_flag_set(inode, FI_AUTO_RECOVER))
- return false;
- return F2FS_I(inode)->last_disk_size == i_size_read(inode);
-}
-
static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth)
{
F2FS_I(inode)->i_current_depth = depth;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid)
{
F2FS_I(inode)->i_xattr_nid = xnid;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino)
{
F2FS_I(inode)->i_pino = pino;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
@@ -1837,13 +1862,31 @@ static inline int is_file(struct inode *inode, int type)
static inline void set_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise |= type;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline void clear_file(struct inode *inode, int type)
{
F2FS_I(inode)->i_advise &= ~type;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
+}
+
+static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
+{
+ if (dsync) {
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ bool ret;
+
+ spin_lock(&sbi->inode_lock[DIRTY_META]);
+ ret = list_empty(&F2FS_I(inode)->gdirty_list);
+ spin_unlock(&sbi->inode_lock[DIRTY_META]);
+ return ret;
+ }
+ if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) ||
+ file_keep_isize(inode) ||
+ i_size_read(inode) & PAGE_MASK)
+ return false;
+ return F2FS_I(inode)->last_disk_size == i_size_read(inode);
}
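A minimal usage sketch of the reworked helper (the real call site is the f2fs_do_sync_file() change in file.c below): for datasync, the inode write is skipped only when the inode sits on no global dirty list; otherwise the FI_AUTO_RECOVER size check applies.

	/* illustrative only, assuming the helpers above */
	static int example_fsync_inode(struct inode *inode, int datasync)
	{
		if (f2fs_skip_inode_update(inode, datasync))
			return 0;			/* nothing inode-side to persist */
		return f2fs_write_inode(inode, NULL);	/* otherwise write it back */
	}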
static inline int f2fs_readonly(struct super_block *sb)
@@ -1955,7 +1998,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t);
unsigned char get_de_type(struct f2fs_dir_entry *);
struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
-bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
+int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
unsigned int, struct fscrypt_str *);
void do_make_empty_dir(struct inode *, struct inode *,
struct f2fs_dentry_ptr *);
@@ -1995,7 +2038,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
/*
* super.c
*/
-int f2fs_inode_dirtied(struct inode *);
+int f2fs_inode_dirtied(struct inode *, bool);
void f2fs_inode_synced(struct inode *);
int f2fs_commit_super(struct f2fs_sb_info *, bool);
int f2fs_sync_fs(struct super_block *, int);
@@ -2034,7 +2077,7 @@ void move_node_page(struct page *, int);
int fsync_node_pages(struct f2fs_sb_info *, struct inode *,
struct writeback_control *, bool);
int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *);
-void build_free_nids(struct f2fs_sb_info *);
+void build_free_nids(struct f2fs_sb_info *, bool);
bool alloc_nid(struct f2fs_sb_info *, nid_t *);
void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
@@ -2060,7 +2103,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
int create_flush_cmd_control(struct f2fs_sb_info *);
-void destroy_flush_cmd_control(struct f2fs_sb_info *);
+void destroy_flush_cmd_control(struct f2fs_sb_info *, bool);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
@@ -2132,12 +2175,15 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
void f2fs_flush_merged_bios(struct f2fs_sb_info *);
int f2fs_submit_page_bio(struct f2fs_io_info *);
void f2fs_submit_page_mbio(struct f2fs_io_info *);
+struct block_device *f2fs_target_device(struct f2fs_sb_info *,
+ block_t, struct bio *);
+int f2fs_target_device_index(struct f2fs_sb_info *, block_t);
void set_data_blkaddr(struct dnode_of_data *);
void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
int reserve_new_blocks(struct dnode_of_data *, blkcnt_t);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
-ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
+int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
struct page *find_data_page(struct inode *, pgoff_t);
@@ -2160,7 +2206,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *,
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
block_t start_bidx_of_node(unsigned int, struct inode *);
-int f2fs_gc(struct f2fs_sb_info *, bool);
+int f2fs_gc(struct f2fs_sb_info *, bool, bool);
void build_gc_manager(struct f2fs_sb_info *);
/*
@@ -2181,12 +2227,12 @@ struct f2fs_stat_info {
unsigned long long hit_largest, hit_cached, hit_rbtree;
unsigned long long hit_total, total_ext;
int ext_tree, zombie_tree, ext_node;
- s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
- s64 inmem_pages;
+ int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+ int inmem_pages;
unsigned int ndirty_dirs, ndirty_files, ndirty_all;
- int nats, dirty_nats, sits, dirty_sits, fnids;
+ int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids;
int total_count, utilization;
- int bg_gc, wb_bios;
+ int bg_gc, nr_wb_cp_data, nr_wb_data;
int inline_xattr, inline_inode, inline_dir, orphans;
unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks;
unsigned int bimodal, avg_vblocks;
@@ -2412,9 +2458,30 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb)
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
}
-static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb)
+static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED);
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline int get_blkz_type(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkaddr)
+{
+ unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz;
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).bdev == bdev)
+ return FDEV(i).blkz_type[zno];
+ return -EINVAL;
+}
+#endif
+
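A worked example of the zone-type lookup, with a hypothetical geometry of log_blocks_per_blkz == 16 (64Ki blocks of 4KB per zone):

	/* illustrative: blkaddr 0x12345 with 64Ki-block zones
	 *   zno = 0x12345 >> 16 = 1
	 * the FDEV(i) whose bdev matches then supplies blkz_type[1], e.g.
	 * BLK_ZONE_TYPE_CONVENTIONAL or BLK_ZONE_TYPE_SEQWRITE_REQ
	 */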
+static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
{
- return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR);
+ struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+
+ return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb);
}
static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
@@ -2453,8 +2520,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy fscrypt_notsupp_process_policy
-#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy
#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c7865073cd26..49f10dce817d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -94,8 +94,6 @@ mapped:
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
- /* if gced page is attached, don't write to cold segment */
- clear_cold_data(page);
out:
sb_end_pagefault(inode->i_sb);
f2fs_update_time(sbi, REQ_TIME);
@@ -210,7 +208,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
}
/* if the inode is dirty, let's recover all the time */
- if (!datasync && !f2fs_skip_inode_update(inode)) {
+ if (!f2fs_skip_inode_update(inode, datasync)) {
f2fs_write_inode(inode, NULL);
goto go_write;
}
@@ -264,7 +262,7 @@ sync_nodes:
}
if (need_inode_block_update(sbi, ino)) {
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
f2fs_write_inode(inode, NULL);
goto sync_nodes;
}
@@ -632,7 +630,7 @@ int f2fs_truncate(struct inode *inode)
return err;
inode->i_mtime = inode->i_ctime = current_time(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -679,6 +677,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
+ bool size_changed = false;
err = setattr_prepare(dentry, attr);
if (err)
@@ -694,7 +693,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
err = f2fs_truncate(inode);
if (err)
return err;
- f2fs_balance_fs(F2FS_I_SB(inode), true);
} else {
/*
* do not trim all blocks after i_size if target size is
@@ -710,6 +708,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
inode->i_mtime = inode->i_ctime = current_time(inode);
}
+
+ size_changed = true;
}
__setattr_copy(inode, attr);
@@ -722,7 +722,12 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- f2fs_mark_inode_dirty_sync(inode);
+ /* file size may have changed here */
+ f2fs_mark_inode_dirty_sync(inode, size_changed);
+
+ /* inode change will produce dirty node pages flushed by checkpoint */
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
return err;
}
@@ -967,7 +972,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
new_size = (dst + i) << PAGE_SHIFT;
if (dst_inode->i_size < new_size)
f2fs_i_size_write(dst_inode, new_size);
- } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen);
+ } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
f2fs_put_dnode(&dn);
} else {
@@ -1218,6 +1223,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = f2fs_do_zero_range(&dn, index, end);
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
if (ret)
goto out;
@@ -1313,15 +1321,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
pgoff_t pg_end;
loff_t new_size = i_size_read(inode);
loff_t off_end;
- int ret;
+ int err;
- ret = inode_newsize_ok(inode, (len + offset));
- if (ret)
- return ret;
+ err = inode_newsize_ok(inode, (len + offset));
+ if (err)
+ return err;
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
f2fs_balance_fs(sbi, true);
@@ -1333,12 +1341,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (off_end)
map.m_len++;
- ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
- if (ret) {
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ if (err) {
pgoff_t last_off;
if (!map.m_len)
- return ret;
+ return err;
last_off = map.m_lblk + map.m_len - 1;
@@ -1352,7 +1360,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
f2fs_i_size_write(inode, new_size);
- return ret;
+ return err;
}
static long f2fs_fallocate(struct file *file, int mode,
@@ -1393,7 +1401,9 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = current_time(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ file_set_keep_isize(inode);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
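The new keep-isize marking ties into the recovery change below: once a FALLOC_FL_KEEP_SIZE preallocation has been fsynced, roll-forward recovery must not extend i_size over the preallocated blocks. Sketch of the interaction (illustrative):

	/* fallocate(fd, FALLOC_FL_KEEP_SIZE, off, len) -> file_set_keep_isize()
	 * recovery (recovery.c below):
	 *   if (!file_keep_isize(inode) && ...) f2fs_i_size_write(...);
	 * i.e. a keep-isize inode keeps its on-disk i_size across recovery
	 */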
@@ -1526,7 +1536,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
goto out;
f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
- "Unexpected flush for atomic writes: ino=%lu, npages=%lld",
+ "Unexpected flush for atomic writes: ino=%lu, npages=%u",
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret)
@@ -1752,31 +1762,16 @@ static bool uuid_is_nonzero(__u8 u[16])
static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
- struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
- if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
- sizeof(policy)))
- return -EFAULT;
-
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- return fscrypt_process_policy(filp, &policy);
+ return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
{
- struct fscrypt_policy policy;
- struct inode *inode = file_inode(filp);
- int err;
-
- err = fscrypt_get_policy(inode, &policy);
- if (err)
- return err;
-
- if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
- return -EFAULT;
- return 0;
+ return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
}
static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
@@ -1842,7 +1837,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
mutex_lock(&sbi->gc_mutex);
}
- ret = f2fs_gc(sbi, sync);
+ ret = f2fs_gc(sbi, sync, true);
out:
mnt_drop_write_file(filp);
return ret;
@@ -2256,12 +2251,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret > 0) {
- ret = f2fs_preallocate_blocks(iocb, from);
- if (!ret) {
- blk_start_plug(&plug);
- ret = __generic_file_write_iter(iocb, from);
- blk_finish_plug(&plug);
+ int err = f2fs_preallocate_blocks(iocb, from);
+
+ if (err) {
+ inode_unlock(inode);
+ return err;
}
+ blk_start_plug(&plug);
+ ret = __generic_file_write_iter(iocb, from);
+ blk_finish_plug(&plug);
}
inode_unlock(inode);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 6f14ee923acd..88bfc3dff496 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -82,7 +82,7 @@ static int gc_thread_func(void *data)
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
@@ -544,13 +544,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return true;
}
-static void move_encrypted_block(struct inode *inode, block_t bidx)
+static void move_encrypted_block(struct inode *inode, block_t bidx,
+ unsigned int segno, int off)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.type = DATA,
.op = REQ_OP_READ,
- .op_flags = READ_SYNC,
+ .op_flags = 0,
.encrypted_page = NULL,
};
struct dnode_of_data dn;
@@ -565,6 +566,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
if (!page)
return;
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+ goto out;
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
if (err)
@@ -625,7 +629,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
fio.op = REQ_OP_WRITE;
- fio.op_flags = WRITE_SYNC;
+ fio.op_flags = REQ_SYNC;
fio.new_blkaddr = newaddr;
f2fs_submit_page_mbio(&fio);
@@ -645,7 +649,8 @@ out:
f2fs_put_page(page, 1);
}
-static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
+static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
+ unsigned int segno, int off)
{
struct page *page;
@@ -653,6 +658,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
if (IS_ERR(page))
return;
+ if (!check_valid_map(F2FS_I_SB(inode), segno, off))
+ goto out;
+
if (gc_type == BG_GC) {
if (PageWriteback(page))
goto out;
@@ -663,7 +671,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
.sbi = F2FS_I_SB(inode),
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC,
+ .op_flags = REQ_SYNC,
.page = page,
.encrypted_page = NULL,
};
@@ -673,8 +681,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
retry:
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA, true);
- if (clear_page_dirty_for_io(page))
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
set_cold_data(page);
@@ -683,8 +693,6 @@ retry:
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
-
- clear_cold_data(page);
}
out:
f2fs_put_page(page, 1);
@@ -794,9 +802,9 @@ next_step:
start_bidx = start_bidx_of_node(nofs, inode)
+ ofs_in_node;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- move_encrypted_block(inode, start_bidx);
+ move_encrypted_block(inode, start_bidx, segno, off);
else
- move_data_page(inode, start_bidx, gc_type);
+ move_data_page(inode, start_bidx, gc_type, segno, off);
if (locked) {
up_write(&fi->dio_rwsem[WRITE]);
@@ -899,7 +907,7 @@ next:
return sec_freed;
}
-int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background)
{
unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
@@ -940,6 +948,9 @@ gc_more:
if (ret)
goto stop;
}
+ } else if (gc_type == BG_GC && !background) {
+ /* f2fs_balance_fs doesn't need to do BG_GC in the critical path. */
+ goto stop;
}
if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
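Combined with the call-site changes elsewhere in this patch, the new background flag gives this call matrix (summary, for illustration):

	/* f2fs_gc(sbi, sync, background) callers:
	 *   GC thread:       f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)
	 *   GC ioctl:        f2fs_gc(sbi, sync,  true)
	 *   f2fs_balance_fs: f2fs_gc(sbi, false, false) -- no BG_GC in the
	 *                                                  critical path
	 */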
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5f1a67f756af..e32a9e527968 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
.sbi = F2FS_I_SB(dn->inode),
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_PRIO,
.page = page,
.encrypted_page = NULL,
};
@@ -137,8 +137,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
fio.old_blkaddr = dn->data_blkaddr;
write_data_page(dn, &fio);
f2fs_wait_on_page_writeback(page, DATA, true);
- if (dirty)
+ if (dirty) {
inode_dec_dirty_pages(dn->inode);
+ remove_dirty_inode(dn->inode);
+ }
/* this converted inline_data should be recovered. */
set_inode_flag(dn->inode, FI_APPEND_WRITE);
@@ -419,7 +421,7 @@ static int f2fs_add_inline_entries(struct inode *dir,
}
new_name.name = d.filename[bit_pos];
- new_name.len = de->name_len;
+ new_name.len = le16_to_cpu(de->name_len);
ino = le32_to_cpu(de->ino);
fake_mode = get_de_type(de) << S_SHIFT;
@@ -573,7 +575,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_put_page(page, 1);
dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
f2fs_drop_nlink(dir, inode);
@@ -610,6 +612,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
struct f2fs_inline_dentry *inline_dentry = NULL;
struct page *ipage = NULL;
struct f2fs_dentry_ptr d;
+ int err;
if (ctx->pos == NR_INLINE_DENTRY)
return 0;
@@ -622,11 +625,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
- if (!f2fs_fill_dentries(ctx, &d, 0, fstr))
+ err = f2fs_fill_dentries(ctx, &d, 0, fstr);
+ if (!err)
ctx->pos = NR_INLINE_DENTRY;
f2fs_put_page(ipage, 1);
- return 0;
+ return err < 0 ? err : 0;
}
int f2fs_inline_data_fiemap(struct inode *inode,
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d7369895a78a..af06bda51a54 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -19,10 +19,11 @@
#include <trace/events/f2fs.h>
-void f2fs_mark_inode_dirty_sync(struct inode *inode)
+void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
{
- if (f2fs_inode_dirtied(inode))
+ if (f2fs_inode_dirtied(inode, sync))
return;
+
mark_inode_dirty_sync(inode);
}
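The new sync flag separates updates that must be tracked for fsync/checkpoint from lightweight ones; the contrast across this patch's call sites, for illustration:

	/* f2fs_mark_inode_dirty_sync(inode, true):  size/blocks/link updates
	 *                                           that fsync must notice
	 * f2fs_mark_inode_dirty_sync(inode, false): time/flag-only updates,
	 *                                           plain VFS dirtying suffices
	 */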
@@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_DIRSYNC;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -252,6 +253,7 @@ retry:
int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
f2fs_inode_synced(inode);
@@ -267,11 +269,13 @@ int update_inode(struct inode *inode, struct page *node_page)
ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(inode->i_blocks);
- if (F2FS_I(inode)->extent_tree)
- set_raw_extent(&F2FS_I(inode)->extent_tree->largest,
- &ri->i_ext);
- else
+ if (et) {
+ read_lock(&et->lock);
+ set_raw_extent(&et->largest, &ri->i_ext);
+ read_unlock(&et->lock);
+ } else {
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
+ }
set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -335,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* We need to balance fs here to prevent producing dirty node pages
* during the urgent cleaning time when running out of free sections.
*/
- if (update_inode_page(inode))
+ if (update_inode_page(inode) && wbc && wbc->nr_to_write)
f2fs_balance_fs(sbi, true);
return 0;
}
@@ -373,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode)
goto no_delete;
#endif
+ remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+
sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
@@ -384,6 +391,8 @@ retry:
f2fs_lock_op(sbi);
err = remove_inode_page(inode);
f2fs_unlock_op(sbi);
+ if (err == -ENOENT)
+ err = 0;
}
/* give more chances, if ENOMEM case */
@@ -403,10 +412,12 @@ no_delete:
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(inode, FI_APPEND_WRITE))
- add_ino_entry(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
- add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ if (inode->i_nlink) {
+ if (is_inode_flag_set(inode, FI_APPEND_WRITE))
+ add_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
+ add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ }
if (is_inode_flag_set(inode, FI_FREE_NID)) {
alloc_nid_failed(sbi, inode->i_ino);
clear_inode_flag(inode, FI_FREE_NID);
@@ -424,6 +435,18 @@ void handle_failed_inode(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct node_info ni;
+ /*
+ * clear nlink of the inode in order to release its resources
+ * immediately.
+ */
+ clear_nlink(inode);
+
+ /*
+ * we must call this to avoid the inode being left dirty, which would
+ * result in a panic when flushing dirty inodes in gdirty_list.
+ */
+ update_inode_page(inode);
+
/* don't make bad inode, since it becomes a regular file. */
unlock_new_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 489fa0d5f914..db33b5631dc8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -778,7 +778,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
- f2fs_mark_inode_dirty_sync(old_inode);
+ f2fs_mark_inode_dirty_sync(old_inode, false);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
@@ -938,7 +938,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(old_dir, old_nlink > 0);
up_write(&F2FS_I(old_dir)->i_sem);
}
- f2fs_mark_inode_dirty_sync(old_dir);
+ f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
@@ -953,7 +953,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, new_nlink > 0);
up_write(&F2FS_I(new_dir)->i_sem);
}
- f2fs_mark_inode_dirty_sync(new_dir);
+ f2fs_mark_inode_dirty_sync(new_dir, false);
f2fs_unlock_op(sbi);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 01177ecdeab8..b9078fdb3743 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
* give 25%, 25%, 50%, 50%, 50% memory for each components respectively
*/
if (type == FREE_NIDS) {
- mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
- PAGE_SHIFT;
+ mem_size = (nm_i->nid_cnt[FREE_NID_LIST] *
+ sizeof(struct free_nid)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
@@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
e = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&e->ni, ne);
} else {
- f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
- nat_get_blkaddr(e) != ne->block_addr ||
+ f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
+ nat_get_blkaddr(e) !=
+ le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
}
}
@@ -1134,7 +1135,7 @@ repeat:
if (!page)
return ERR_PTR(-ENOMEM);
- err = read_node_page(page, READ_SYNC);
+ err = read_node_page(page, 0);
if (err < 0) {
f2fs_put_page(page, 1);
return ERR_PTR(err);
@@ -1204,6 +1205,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
ret = f2fs_write_inline_data(inode, page);
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
if (ret)
set_page_dirty(page);
page_out:
@@ -1338,7 +1340,8 @@ retry:
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_put_page(last_page, 0);
pagevec_release(&pvec);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
if (!IS_DNODE(page) || !is_cold_node(page))
@@ -1407,11 +1410,12 @@ continue_unlock:
"Retry to write fsync mark: ino=%u, idx=%lx",
ino, last_page->index);
lock_page(last_page);
+ f2fs_wait_on_page_writeback(last_page, NODE, true);
set_page_dirty(last_page);
unlock_page(last_page);
goto retry;
}
-
+out:
if (nwritten)
f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE);
return ret ? -EIO: 0;
@@ -1570,7 +1574,7 @@ static int f2fs_write_node_page(struct page *page,
.sbi = sbi,
.type = NODE,
.op = REQ_OP_WRITE,
- .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+ .op_flags = wbc_to_write_flags(wbc),
.page = page,
.encrypted_page = NULL,
};
@@ -1692,11 +1696,35 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
return radix_tree_lookup(&nm_i->free_nid_root, n);
}
-static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
- struct free_nid *i)
+static int __insert_nid_to_list(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_list list, bool new)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ if (new) {
+ int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+ if (err)
+ return err;
+ }
+
+ f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
+ i->state != NID_ALLOC);
+ nm_i->nid_cnt[list]++;
+ list_add_tail(&i->list, &nm_i->nid_list[list]);
+ return 0;
+}
+
+static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
+ struct free_nid *i, enum nid_list list, bool reuse)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
+ i->state != NID_ALLOC);
+ nm_i->nid_cnt[list]--;
list_del(&i->list);
- radix_tree_delete(&nm_i->free_nid_root, i->nid);
+ if (!reuse)
+ radix_tree_delete(&nm_i->free_nid_root, i->nid);
}
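Taken together with the alloc_nid* changes below, these helpers imply the following nid lifecycle (illustrative summary, not new code):

	/* nid lifecycle across the two lists:
	 *   add_free_nid():     radix insert, NID_NEW on FREE_NID_LIST
	 *   alloc_nid():        remove(reuse=true) from FREE, state = NID_ALLOC,
	 *                       insert(new=false) on ALLOC_NID_LIST
	 *   alloc_nid_done():   remove(reuse=false) from ALLOC, kmem_cache_free()
	 *   alloc_nid_failed(): back to FREE_NID_LIST as NID_NEW, or freed
	 *                       outright when FREE_NIDS memory is tight
	 */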
static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
@@ -1704,9 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
struct nat_entry *ne;
-
- if (!available_free_memory(sbi, FREE_NIDS))
- return -1;
+ int err;
/* 0 nid should not be used */
if (unlikely(nid == 0))
@@ -1729,33 +1755,30 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
return 0;
}
- spin_lock(&nm_i->free_nid_list_lock);
- if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
- spin_unlock(&nm_i->free_nid_list_lock);
- radix_tree_preload_end();
+ spin_lock(&nm_i->nid_list_lock);
+ err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+ spin_unlock(&nm_i->nid_list_lock);
+ radix_tree_preload_end();
+ if (err) {
kmem_cache_free(free_nid_slab, i);
return 0;
}
- list_add_tail(&i->list, &nm_i->free_nid_list);
- nm_i->fcnt++;
- spin_unlock(&nm_i->free_nid_list_lock);
- radix_tree_preload_end();
return 1;
}
-static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
bool need_free = false;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
if (i && i->state == NID_NEW) {
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
need_free = true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
@@ -1778,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
- if (blk_addr == NULL_ADDR) {
- if (add_free_nid(sbi, start_nid, true) < 0)
- break;
- }
+ if (blk_addr == NULL_ADDR)
+ add_free_nid(sbi, start_nid, true);
}
}
-void build_free_nids(struct f2fs_sb_info *sbi)
+static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1794,7 +1815,10 @@ void build_free_nids(struct f2fs_sb_info *sbi)
nid_t nid = nm_i->next_scan_nid;
/* Enough entries */
- if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK)
+ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
+ return;
+
+ if (!sync && !available_free_memory(sbi, FREE_NIDS))
return;
/* readahead nat pages to be scanned */
@@ -1830,7 +1854,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
if (addr == NULL_ADDR)
add_free_nid(sbi, nid, true);
else
- remove_free_nid(nm_i, nid);
+ remove_free_nid(sbi, nid);
}
up_read(&curseg->journal_rwsem);
up_read(&nm_i->nat_tree_lock);
@@ -1839,6 +1863,13 @@ void build_free_nids(struct f2fs_sb_info *sbi)
nm_i->ra_nid_pages, META_NAT, false);
}
+void build_free_nids(struct f2fs_sb_info *sbi, bool sync)
+{
+ mutex_lock(&NM_I(sbi)->build_lock);
+ __build_free_nids(sbi, sync);
+ mutex_unlock(&NM_I(sbi)->build_lock);
+}
+
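The sync argument decides whether the scan may be skipped under memory pressure; the two calling patterns in this patch are, for illustration:

	build_free_nids(sbi, false);	/* f2fs_balance_fs_bg(): best effort,
					 * bails out when FREE_NIDS memory
					 * is tight */
	build_free_nids(sbi, true);	/* alloc_nid()/mount: must scan so the
					 * allocator can make progress */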
/*
* If this function returns success, the caller can obtain a new nid
* from its second parameter.
@@ -1853,31 +1884,31 @@ retry:
if (time_to_inject(sbi, FAULT_ALLOC_NID))
return false;
#endif
- if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
- return false;
+ spin_lock(&nm_i->nid_list_lock);
- spin_lock(&nm_i->free_nid_list_lock);
+ if (unlikely(nm_i->available_nids == 0)) {
+ spin_unlock(&nm_i->nid_list_lock);
+ return false;
+ }
/* We should not use stale free nids created by build_free_nids */
- if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
- list_for_each_entry(i, &nm_i->free_nid_list, list)
- if (i->state == NID_NEW)
- break;
-
- f2fs_bug_on(sbi, i->state != NID_NEW);
+ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) {
+ f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST]));
+ i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
+ struct free_nid, list);
*nid = i->nid;
+
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, true);
i->state = NID_ALLOC;
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+ __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
+ nm_i->available_nids--;
+ spin_unlock(&nm_i->nid_list_lock);
return true;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
/* Let's scan nat pages and its caches to get free nids */
- mutex_lock(&nm_i->build_lock);
- build_free_nids(sbi);
- mutex_unlock(&nm_i->build_lock);
+ build_free_nids(sbi, true);
goto retry;
}
@@ -1889,11 +1920,11 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, !i);
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
}
@@ -1910,17 +1941,22 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
if (!nid)
return;
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
i = __lookup_free_nid_list(nm_i, nid);
- f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
+ f2fs_bug_on(sbi, !i);
+
if (!available_free_memory(sbi, FREE_NIDS)) {
- __del_from_free_nid_list(nm_i, i);
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
need_free = true;
} else {
+ __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true);
i->state = NID_NEW;
- nm_i->fcnt++;
+ __insert_nid_to_list(sbi, i, FREE_NID_LIST, false);
}
- spin_unlock(&nm_i->free_nid_list_lock);
+
+ nm_i->available_nids++;
+
+ spin_unlock(&nm_i->nid_list_lock);
if (need_free)
kmem_cache_free(free_nid_slab, i);
@@ -1932,24 +1968,24 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
struct free_nid *i, *next;
int nr = nr_shrink;
- if (nm_i->fcnt <= MAX_FREE_NIDS)
+ if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
return 0;
if (!mutex_trylock(&nm_i->build_lock))
return 0;
- spin_lock(&nm_i->free_nid_list_lock);
- list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
- if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS)
+ spin_lock(&nm_i->nid_list_lock);
+ list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST],
+ list) {
+ if (nr_shrink <= 0 ||
+ nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
break;
- if (i->state == NID_ALLOC)
- continue;
- __del_from_free_nid_list(nm_i, i);
+
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
kmem_cache_free(free_nid_slab, i);
- nm_i->fcnt--;
nr_shrink--;
}
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
mutex_unlock(&nm_i->build_lock);
return nr - nr_shrink;
@@ -2005,7 +2041,7 @@ recover_xnid:
if (unlikely(!inc_valid_node_count(sbi, inode)))
f2fs_bug_on(sbi, 1);
- remove_free_nid(NM_I(sbi), new_xnid);
+ remove_free_nid(sbi, new_xnid);
get_node_info(sbi, new_xnid, &ni);
ni.ino = inode->i_ino;
set_node_addr(sbi, &ni, NEW_ADDR, false);
@@ -2035,7 +2071,7 @@ retry:
}
/* Should not use this inode from free nid list */
- remove_free_nid(NM_I(sbi), ino);
+ remove_free_nid(sbi, ino);
if (!PageUptodate(ipage))
SetPageUptodate(ipage);
@@ -2069,7 +2105,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
struct f2fs_node *rn;
struct f2fs_summary *sum_entry;
block_t addr;
- int bio_blocks = MAX_BIO_BLOCKS(sbi);
int i, idx, last_offset, nrpages;
/* scan the node segment */
@@ -2078,7 +2113,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
sum_entry = &sum->entries[0];
for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
- nrpages = min(last_offset - i, bio_blocks);
+ nrpages = min(last_offset - i, BIO_MAX_PAGES);
/* readahead node pages */
ra_meta_pages(sbi, addr, nrpages, META_POR, true);
@@ -2120,6 +2155,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
ne = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&ne->ni, &raw_ne);
}
+
+ /*
+ * if a free nat in the journal has not been used since the last
+ * checkpoint, we should remove it from the available nids, since
+ * we will add it back later.
+ */
+ if (!get_nat_flag(ne, IS_DIRTY) &&
+ le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
+ spin_lock(&nm_i->nid_list_lock);
+ nm_i->available_nids--;
+ spin_unlock(&nm_i->nid_list_lock);
+ }
+
__set_nat_cache_dirty(nm_i, ne);
}
update_nats_in_cursum(journal, -i);
@@ -2192,8 +2240,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
raw_nat_from_node_info(raw_ne, &ne->ni);
nat_reset_flag(ne);
__clear_nat_cache_dirty(NM_I(sbi), ne);
- if (nat_get_blkaddr(ne) == NULL_ADDR)
+ if (nat_get_blkaddr(ne) == NULL_ADDR) {
add_free_nid(sbi, nid, false);
+ spin_lock(&NM_I(sbi)->nid_list_lock);
+ NM_I(sbi)->available_nids++;
+ spin_unlock(&NM_I(sbi)->nid_list_lock);
+ }
}
if (to_journal)
@@ -2268,21 +2320,24 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
/* not used nids: 0, node, meta, (and root counted as valid node) */
- nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
- nm_i->fcnt = 0;
+ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
+ F2FS_RESERVED_NODE_NUM;
+ nm_i->nid_cnt[FREE_NID_LIST] = 0;
+ nm_i->nid_cnt[ALLOC_NID_LIST] = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
- INIT_LIST_HEAD(&nm_i->free_nid_list);
+ INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]);
+ INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
INIT_LIST_HEAD(&nm_i->nat_entries);
mutex_init(&nm_i->build_lock);
- spin_lock_init(&nm_i->free_nid_list_lock);
+ spin_lock_init(&nm_i->nid_list_lock);
init_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
@@ -2310,7 +2365,7 @@ int build_node_manager(struct f2fs_sb_info *sbi)
if (err)
return err;
- build_free_nids(sbi);
+ build_free_nids(sbi, true);
return 0;
}
@@ -2327,17 +2382,18 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
return;
/* destroy free nid list */
- spin_lock(&nm_i->free_nid_list_lock);
- list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
- f2fs_bug_on(sbi, i->state == NID_ALLOC);
- __del_from_free_nid_list(nm_i, i);
- nm_i->fcnt--;
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
+ list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST],
+ list) {
+ __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+ spin_unlock(&nm_i->nid_list_lock);
kmem_cache_free(free_nid_slab, i);
- spin_lock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
}
- f2fs_bug_on(sbi, nm_i->fcnt);
- spin_unlock(&nm_i->free_nid_list_lock);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]);
+ f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]);
+ f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST]));
+ spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
down_write(&nm_i->nat_tree_lock);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 868bec65e51c..e7997e240366 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *fnid;
- spin_lock(&nm_i->free_nid_list_lock);
- if (nm_i->fcnt <= 0) {
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_lock(&nm_i->nid_list_lock);
+ if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) {
+ spin_unlock(&nm_i->nid_list_lock);
return;
}
- fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
+ fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next,
+ struct free_nid, list);
*nid = fnid->nid;
- spin_unlock(&nm_i->free_nid_list_lock);
+ spin_unlock(&nm_i->nid_list_lock);
}
/*
@@ -313,7 +314,7 @@ static inline bool is_recoverable_dnode(struct page *page)
((unsigned char *)ckpt + crc_offset)));
cp_ver |= (crc << 32);
}
- return cpu_to_le64(cp_ver) == cpver_of_node(page);
+ return cp_ver == cpver_of_node(page);
}
/*
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 2fc84a991325..981a9584b62f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -180,13 +180,15 @@ static void recover_inode(struct inode *inode, struct page *page)
inode->i_mode = le16_to_cpu(raw->i_mode);
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
- inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
+ inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
- inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ F2FS_I(inode)->i_advise = raw->i_advise;
+
if (file_enc_name(inode))
name = "<encrypted>";
else
@@ -196,32 +198,6 @@ static void recover_inode(struct inode *inode, struct page *page)
ino_of_node(page), name);
}
-static bool is_same_inode(struct inode *inode, struct page *ipage)
-{
- struct f2fs_inode *ri = F2FS_INODE(ipage);
- struct timespec disk;
-
- if (!IS_INODE(ipage))
- return true;
-
- disk.tv_sec = le64_to_cpu(ri->i_ctime);
- disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
- if (timespec_compare(&inode->i_ctime, &disk) > 0)
- return false;
-
- disk.tv_sec = le64_to_cpu(ri->i_atime);
- disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- if (timespec_compare(&inode->i_atime, &disk) > 0)
- return false;
-
- disk.tv_sec = le64_to_cpu(ri->i_mtime);
- disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
- if (timespec_compare(&inode->i_mtime, &disk) > 0)
- return false;
-
- return true;
-}
-
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
struct curseg_info *curseg;
@@ -248,10 +224,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (entry) {
- if (!is_same_inode(entry->inode, page))
- goto next;
- } else {
+ if (!entry) {
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
@@ -454,7 +427,8 @@ retry_dn:
continue;
}
- if ((start + 1) << PAGE_SHIFT > i_size_read(inode))
+ if (!file_keep_isize(inode) &&
+ (i_size_read(inode) <= (start << PAGE_SHIFT)))
f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT);
/*
@@ -507,8 +481,10 @@ err:
f2fs_put_dnode(&dn);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
- "recover_data: ino = %lx, recovered = %d blocks, err = %d",
- inode->i_ino, recovered, err);
+ "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
+ inode->i_ino,
+ file_keep_isize(inode) ? "keep" : "recover",
+ recovered, err);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fc886f008449..0738f48293cc 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode,
.sbi = sbi,
.type = DATA,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_PRIO,
.encrypted_page = NULL,
};
bool submit_bio = false;
@@ -274,8 +274,10 @@ static int __commit_inmem_pages(struct inode *inode,
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA, true);
- if (clear_page_dirty_for_io(page))
+ if (clear_page_dirty_for_io(page)) {
inode_dec_dirty_pages(inode);
+ remove_dirty_inode(inode);
+ }
fio.page = page;
err = do_write_data_page(&fio);
@@ -287,7 +289,6 @@ static int __commit_inmem_pages(struct inode *inode,
/* record old blkaddr for revoking */
cur->old_addr = fio.old_blkaddr;
- clear_cold_data(page);
submit_bio = true;
}
unlock_page(page);
@@ -363,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
*/
if (has_not_enough_free_secs(sbi, 0, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi, false);
+ f2fs_gc(sbi, false, false);
}
}
@@ -380,14 +381,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
if (!available_free_memory(sbi, FREE_NIDS))
try_to_free_nids(sbi, MAX_FREE_NIDS);
else
- build_free_nids(sbi);
+ build_free_nids(sbi, false);
+
+ if (!is_idle(sbi))
+ return;
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
!available_free_memory(sbi, INO_ENTRIES) ||
excess_prefree_segs(sbi) ||
excess_dirty_nats(sbi) ||
- (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
+ f2fs_time_over(sbi, CP_TIME)) {
if (test_opt(sbi, DATA_FLUSH)) {
struct blk_plug plug;
@@ -400,6 +404,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
}
}
+static int __submit_flush_wait(struct block_device *bdev)
+{
+ struct bio *bio = f2fs_bio_alloc(0);
+ int ret;
+
+ bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ bio->bi_bdev = bdev;
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ return ret;
+}
+
+static int submit_flush_wait(struct f2fs_sb_info *sbi)
+{
+ int ret = __submit_flush_wait(sbi->sb->s_bdev);
+ int i;
+
+ if (sbi->s_ndevs && !ret) {
+ for (i = 1; i < sbi->s_ndevs; i++) {
+ ret = __submit_flush_wait(FDEV(i).bdev);
+ if (ret)
+ break;
+ }
+ }
+ return ret;
+}
+
static int issue_flush_thread(void *data)
{
struct f2fs_sb_info *sbi = data;
@@ -410,25 +441,18 @@ repeat:
return 0;
if (!llist_empty(&fcc->issue_list)) {
- struct bio *bio;
struct flush_cmd *cmd, *next;
int ret;
- bio = f2fs_bio_alloc(0);
-
fcc->dispatch_list = llist_del_all(&fcc->issue_list);
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
- ret = submit_bio_wait(bio);
-
+ ret = submit_flush_wait(sbi);
llist_for_each_entry_safe(cmd, next,
fcc->dispatch_list, llnode) {
cmd->ret = ret;
complete(&cmd->wait);
}
- bio_put(bio);
fcc->dispatch_list = NULL;
}
@@ -449,15 +473,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
return 0;
if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) {
- struct bio *bio = f2fs_bio_alloc(0);
int ret;
atomic_inc(&fcc->submit_flush);
- bio->bi_bdev = sbi->sb->s_bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
- ret = submit_bio_wait(bio);
+ ret = submit_flush_wait(sbi);
atomic_dec(&fcc->submit_flush);
- bio_put(bio);
return ret;
}
@@ -469,8 +489,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
if (!fcc->dispatch_list)
wake_up(&fcc->flush_wait_queue);
- wait_for_completion(&cmd.wait);
- atomic_dec(&fcc->submit_flush);
+ if (fcc->f2fs_issue_flush) {
+ wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->submit_flush);
+ } else {
+ llist_del_all(&fcc->issue_list);
+ atomic_set(&fcc->submit_flush, 0);
+ }
return cmd.ret;
}
@@ -481,6 +506,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
struct flush_cmd_control *fcc;
int err = 0;
+ if (SM_I(sbi)->cmd_control_info) {
+ fcc = SM_I(sbi)->cmd_control_info;
+ goto init_thread;
+ }
+
fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
if (!fcc)
return -ENOMEM;
@@ -488,6 +518,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
init_waitqueue_head(&fcc->flush_wait_queue);
init_llist_head(&fcc->issue_list);
SM_I(sbi)->cmd_control_info = fcc;
+init_thread:
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
@@ -500,14 +531,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
return err;
}
-void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
+void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
{
struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
- if (fcc && fcc->f2fs_issue_flush)
- kthread_stop(fcc->f2fs_issue_flush);
- kfree(fcc);
- SM_I(sbi)->cmd_control_info = NULL;
+ if (fcc && fcc->f2fs_issue_flush) {
+ struct task_struct *flush_thread = fcc->f2fs_issue_flush;
+
+ fcc->f2fs_issue_flush = NULL;
+ kthread_stop(flush_thread);
+ }
+ if (free) {
+ kfree(fcc);
+ SM_I(sbi)->cmd_control_info = NULL;
+ }
}
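The new free argument lets the flush thread be stopped without discarding the control structure, pairing with the init_thread label added to create_flush_cmd_control() above. A hypothetical remount sequence, for illustration:

	destroy_flush_cmd_control(sbi, false);	/* ro remount: stop thread, keep fcc */
	create_flush_cmd_control(sbi);		/* rw remount: restart via init_thread */
	destroy_flush_cmd_control(sbi, true);	/* unmount: full teardown */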
static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
@@ -633,15 +670,23 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio)
}
/* this function is copied from blkdev_issue_discard in block/blk-lib.c */
-int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
{
- struct block_device *bdev = sbi->sb->s_bdev;
struct bio *bio = NULL;
int err;
- err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
- &bio);
+ trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
+
+ if (sbi->s_ndevs) {
+ int devi = f2fs_target_device_index(sbi, blkstart);
+
+ blkstart -= FDEV(devi).start_blk;
+ }
+ err = __blkdev_issue_discard(bdev,
+ SECTOR_FROM_BLOCK(blkstart),
+ SECTOR_FROM_BLOCK(blklen),
+ GFP_NOFS, 0, &bio);
if (!err && bio) {
struct bio_entry *be = __add_bio_entry(sbi, bio);
@@ -654,24 +699,101 @@ int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector,
return err;
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+ sector_t nr_sects = SECTOR_FROM_BLOCK(blklen);
+ sector_t sector;
+ int devi = 0;
+
+ if (sbi->s_ndevs) {
+ devi = f2fs_target_device_index(sbi, blkstart);
+ blkstart -= FDEV(devi).start_blk;
+ }
+ sector = SECTOR_FROM_BLOCK(blkstart);
+
+ if (sector & (bdev_zone_size(bdev) - 1) ||
+ nr_sects != bdev_zone_size(bdev)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "(%d) %s: Unaligned discard attempted (block %x + %x)",
+ devi, sbi->s_ndevs ? FDEV(devi).path: "",
+ blkstart, blklen);
+ return -EIO;
+ }
+
+ /*
+ * We need to know the type of the zone: for conventional zones,
+ * use regular discard if the drive supports it. For sequential
+ * zones, reset the zone write pointer.
+ */
+ switch (get_blkz_type(sbi, bdev, blkstart)) {
+
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!blk_queue_discard(bdev_get_queue(bdev)))
+ return 0;
+ return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ trace_f2fs_issue_reset_zone(sbi->sb, blkstart);
+ return blkdev_reset_zones(bdev, sector,
+ nr_sects, GFP_NOFS);
+ default:
+ /* Unknown zone type: broken device ? */
+ return -EIO;
+ }
+}
+#endif
+
+static int __issue_discard_async(struct f2fs_sb_info *sbi,
+ struct block_device *bdev, block_t blkstart, block_t blklen)
+{
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_mounted_blkzoned(sbi->sb) &&
+ bdev_zoned_model(bdev) != BLK_ZONED_NONE)
+ return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
+#endif
+ return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen);
+}
+
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
block_t blkstart, block_t blklen)
{
- sector_t start = SECTOR_FROM_BLOCK(blkstart);
- sector_t len = SECTOR_FROM_BLOCK(blklen);
+ sector_t start = blkstart, len = 0;
+ struct block_device *bdev;
struct seg_entry *se;
unsigned int offset;
block_t i;
+ int err = 0;
+
+ bdev = f2fs_target_device(sbi, blkstart, NULL);
+
+ for (i = blkstart; i < blkstart + blklen; i++, len++) {
+ if (i != start) {
+ struct block_device *bdev2 =
+ f2fs_target_device(sbi, i, NULL);
+
+ if (bdev2 != bdev) {
+ err = __issue_discard_async(sbi, bdev,
+ start, len);
+ if (err)
+ return err;
+ bdev = bdev2;
+ start = i;
+ len = 0;
+ }
+ }
- for (i = blkstart; i < blkstart + blklen; i++) {
se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
offset = GET_BLKOFF_FROM_SEG0(sbi, i);
if (!f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
}
- trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
- return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0);
+
+ if (len)
+ err = __issue_discard_async(sbi, bdev, start, len);
+ return err;
}
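A worked example of the per-device splitting, assuming a hypothetical two-device layout with FDEV(1).start_blk == 1000:

	/* f2fs_issue_discard(sbi, 996, 8) issues two device-local ranges:
	 *   __issue_discard_async(sbi, bdev0, start=996,  len=4)
	 *   __issue_discard_async(sbi, bdev1, start=1000, len=4)
	 * each callee rebases start by FDEV(devi).start_blk and converts to
	 * sectors with SECTOR_FROM_BLOCK() before issuing
	 */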
static void __add_discard_entry(struct f2fs_sb_info *sbi,
@@ -1296,25 +1418,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
stat_inc_seg_type(sbi, curseg);
}
-static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
-{
- struct curseg_info *curseg = CURSEG_I(sbi, type);
- unsigned int old_segno;
-
- old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
- locate_dirty_segment(sbi, old_segno);
-}
-
void allocate_new_segments(struct f2fs_sb_info *sbi)
{
+ struct curseg_info *curseg;
+ unsigned int old_segno;
int i;
if (test_opt(sbi, LFS))
return;
- for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
- __allocate_new_segments(sbi, i);
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ curseg = CURSEG_I(sbi, i);
+ old_segno = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
+ locate_dirty_segment(sbi, old_segno);
+ }
}
static const struct segment_allocation default_salloc_ops = {
@@ -1448,21 +1566,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_summary *sum, int type)
{
struct sit_info *sit_i = SIT_I(sbi);
- struct curseg_info *curseg;
- bool direct_io = (type == CURSEG_DIRECT_IO);
-
- type = direct_io ? CURSEG_WARM_DATA : type;
-
- curseg = CURSEG_I(sbi, type);
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
- /* direct_io'ed data is aligned to the segment for better performance */
- if (direct_io && curseg->next_blkoff &&
- !has_not_enough_free_secs(sbi, 0, 0))
- __allocate_new_segments(sbi, type);
-
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
/*
@@ -1515,7 +1623,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
.sbi = sbi,
.type = META,
.op = REQ_OP_WRITE,
- .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO,
+ .op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = page->index,
.new_blkaddr = page->index,
.page = page,
@@ -2166,7 +2274,6 @@ out:
static int build_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct sit_info *sit_i;
unsigned int sit_segs, start;
char *src_bitmap, *dst_bitmap;
@@ -2233,7 +2340,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
- sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
+ sit_i->written_valid_blocks = 0;
sit_i->sit_bitmap = dst_bitmap;
sit_i->bitmap_size = bitmap_size;
sit_i->dirty_sentries = 0;
@@ -2315,10 +2422,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(sbi) * 8;
do {
- readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
+ readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES,
+ META_SIT, true);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
@@ -2387,6 +2494,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
struct seg_entry *sentry = get_seg_entry(sbi, start);
if (!sentry->valid_blocks)
__set_free(sbi, start);
+ else
+ SIT_I(sbi)->written_valid_blocks +=
+ sentry->valid_blocks;
}
/* set use the current segments */
@@ -2645,7 +2755,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
if (!sm_info)
return;
- destroy_flush_cmd_control(sbi);
+ destroy_flush_cmd_control(sbi, true);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index fecb856ad874..9d44ce83acb2 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -18,6 +18,8 @@
#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */
+#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */
+
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
@@ -102,8 +104,6 @@
(((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
(sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
-#define MAX_BIO_BLOCKS(sbi) \
- ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
if (test_opt(sbi, LFS))
return false;
- return free_sections(sbi) <= (node_secs + 2 * dent_secs +
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
reserved_sections(sbi) + 1);
}
@@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
{
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
-
- node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
return (free_sections(sbi) + freed) <=
- (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed);
+ (node_secs + 2 * dent_secs + imeta_secs +
+ reserved_sections(sbi) + needed);
}
static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
@@ -695,13 +696,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
return false;
}
-static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- return SECTOR_TO_BLOCK(queue_max_sectors(q));
-}
-
/*
* It is very important to gather dirty pages and write at once, so that we can
* submit a big bio without interfering with other data writes.
@@ -719,7 +713,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
else if (type == NODE)
return 8 * sbi->blocks_per_seg;
else if (type == META)
- return 8 * MAX_BIO_BLOCKS(sbi);
+ return 8 * BIO_MAX_PAGES;
else
return 0;
}
@@ -736,11 +730,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
return 0;
nr_to_write = wbc->nr_to_write;
-
+ desired = BIO_MAX_PAGES;
if (type == NODE)
- desired = 2 * max_hw_blocks(sbi);
- else
- desired = MAX_BIO_BLOCKS(sbi);
+ desired <<= 1;
wbc->nr_to_write = desired;
return desired - nr_to_write;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 46c915425923..5c60fc28ec75 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -21,14 +21,16 @@ static unsigned int shrinker_run_no;
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
{
- return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+ long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt;
+
+ return count > 0 ? count : 0;
}
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
{
- if (NM_I(sbi)->fcnt > MAX_FREE_NIDS)
- return NM_I(sbi)->fcnt - MAX_FREE_NIDS;
- return 0;
+ long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS;
+
+ return count > 0 ? count : 0;
}
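Both shrinker counters now follow the same clamp-to-zero pattern. A userspace sketch of why the clamp matters when the counters are sampled without a lock:

#include <stdio.h>

/* If 'pinned' is sampled after 'total' has already dropped, the
 * difference can transiently go negative; returned as an unsigned
 * long that would look like a huge reclaimable count, so clamp. */
static unsigned long count_reclaimable(long total, long pinned)
{
    long count = total - pinned;

    return count > 0 ? count : 0;
}

int main(void)
{
    printf("%lu %lu\n", count_reclaimable(10, 4),   /* 6 */
                        count_reclaimable(3, 7));   /* 0, not huge */
    return 0;
}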
static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6132b4ce4e4c..702638e21c76 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -412,14 +412,20 @@ static int parse_options(struct super_block *sb, char *options)
q = bdev_get_queue(sb->s_bdev);
if (blk_queue_discard(q)) {
set_opt(sbi, DISCARD);
- } else {
+ } else if (!f2fs_sb_mounted_blkzoned(sb)) {
f2fs_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but "
"the device does not support discard");
}
break;
case Opt_nodiscard:
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "discard is required for zoned block devices");
+ return -EINVAL;
+ }
clear_opt(sbi, DISCARD);
+ break;
case Opt_noheap:
set_opt(sbi, NOHEAP);
break;
@@ -512,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options)
return -ENOMEM;
if (strlen(name) == 8 &&
!strncmp(name, "adaptive", 8)) {
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_WARNING,
+ "adaptive mode is not allowed with "
+ "zoned block device feature");
+ kfree(name);
+ return -EINVAL;
+ }
set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
} else if (strlen(name) == 3 &&
!strncmp(name, "lfs", 3)) {
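Two policy points fall out of these parse_options() hunks, plus one silent fix: the new break after clear_opt(sbi, DISCARD) also ends an unintended fall-through into Opt_noheap.

/* Rationale, from the zoned block device model rather than the patch
 * text: host-managed zones must be written sequentially and are reset
 * as a whole, so f2fs needs LFS-style allocation (hence "adaptive" is
 * rejected) and discard (which maps to zone reset) cannot be turned
 * off. The default_options() hunk further down sets LFS + DISCARD for
 * blkzoned volumes for the same reason. */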
@@ -558,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_once((void *) fi);
- if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) {
- kmem_cache_free(f2fs_inode_cachep, fi);
- return NULL;
- }
-
/* Initialize f2fs-specific inode info */
fi->vfs_inode.i_version = 1;
+ atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
init_rwsem(&fi->i_sem);
@@ -620,24 +629,25 @@ static int f2fs_drop_inode(struct inode *inode)
return generic_drop_inode(inode);
}
-int f2fs_inode_dirtied(struct inode *inode)
+int f2fs_inode_dirtied(struct inode *inode, bool sync)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int ret = 0;
spin_lock(&sbi->inode_lock[DIRTY_META]);
if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
- spin_unlock(&sbi->inode_lock[DIRTY_META]);
- return 1;
+ ret = 1;
+ } else {
+ set_inode_flag(inode, FI_DIRTY_INODE);
+ stat_inc_dirty_inode(sbi, DIRTY_META);
}
-
- set_inode_flag(inode, FI_DIRTY_INODE);
- list_add_tail(&F2FS_I(inode)->gdirty_list,
+ if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_add_tail(&F2FS_I(inode)->gdirty_list,
&sbi->inode_list[DIRTY_META]);
- inc_page_count(sbi, F2FS_DIRTY_IMETA);
- stat_inc_dirty_inode(sbi, DIRTY_META);
+ inc_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
spin_unlock(&sbi->inode_lock[DIRTY_META]);
-
- return 0;
+ return ret;
}
void f2fs_inode_synced(struct inode *inode)
@@ -649,10 +659,12 @@ void f2fs_inode_synced(struct inode *inode)
spin_unlock(&sbi->inode_lock[DIRTY_META]);
return;
}
- list_del_init(&F2FS_I(inode)->gdirty_list);
+ if (!list_empty(&F2FS_I(inode)->gdirty_list)) {
+ list_del_init(&F2FS_I(inode)->gdirty_list);
+ dec_page_count(sbi, F2FS_DIRTY_IMETA);
+ }
clear_inode_flag(inode, FI_DIRTY_INODE);
clear_inode_flag(inode, FI_AUTO_RECOVER);
- dec_page_count(sbi, F2FS_DIRTY_IMETA);
stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META);
spin_unlock(&sbi->inode_lock[DIRTY_META]);
}
@@ -676,7 +688,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags)
if (is_inode_flag_set(inode, FI_AUTO_RECOVER))
clear_inode_flag(inode, FI_AUTO_RECOVER);
- f2fs_inode_dirtied(inode);
+ f2fs_inode_dirtied(inode, false);
}
static void f2fs_i_callback(struct rcu_head *head)
@@ -687,20 +699,28 @@ static void f2fs_i_callback(struct rcu_head *head)
static void f2fs_destroy_inode(struct inode *inode)
{
- percpu_counter_destroy(&F2FS_I(inode)->dirty_pages);
call_rcu(&inode->i_rcu, f2fs_i_callback);
}
static void destroy_percpu_info(struct f2fs_sb_info *sbi)
{
- int i;
-
- for (i = 0; i < NR_COUNT_TYPE; i++)
- percpu_counter_destroy(&sbi->nr_pages[i]);
percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
}
+static void destroy_device_list(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ for (i = 0; i < sbi->s_ndevs; i++) {
+ blkdev_put(FDEV(i).bdev, FMODE_EXCL);
+#ifdef CONFIG_BLK_DEV_ZONED
+ kfree(FDEV(i).blkz_type);
+#endif
+ }
+ kfree(sbi->devs);
+}
+
static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -738,7 +758,6 @@ static void f2fs_put_super(struct super_block *sb)
* In addition, EIO will skip doing a checkpoint, so we need this as well.
*/
release_ino_entry(sbi, true);
- release_discard_addrs(sbi);
f2fs_leave_shrinker(sbi);
mutex_unlock(&sbi->umount_mutex);
@@ -762,6 +781,8 @@ static void f2fs_put_super(struct super_block *sb)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->raw_super);
+ destroy_device_list(sbi);
+
destroy_percpu_info(sbi);
kfree(sbi);
}
@@ -789,13 +810,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
static int f2fs_freeze(struct super_block *sb)
{
- int err;
-
if (f2fs_readonly(sb))
return 0;
- err = f2fs_sync_fs(sb, 1);
- return err;
+ /* IO error happened before */
+ if (unlikely(f2fs_cp_error(F2FS_SB(sb))))
+ return -EIO;
+
+ /* must be clean, since sync_filesystem() was already called */
+ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
+ return -EINVAL;
+ return 0;
}
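The rewrite above changes f2fs_freeze() from doing its own sync to merely verifying state. A simplified view of the VFS ordering the new comment relies on:

/* freeze_super(sb)
 *   -> sync_filesystem(sb)        writeback of everything
 *   -> sb->s_op->freeze_fs(sb)    f2fs_freeze(): verify only
 * Any SBI_IS_DIRTY left at this point means the sync did not take
 * effect, so failing with -EINVAL beats freezing a dirty image. */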
static int f2fs_unfreeze(struct super_block *sb)
@@ -822,7 +847,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = user_block_count - valid_user_blocks(sbi);
buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
- buf->f_ffree = buf->f_files - valid_inode_count(sbi);
+ buf->f_ffree = min(buf->f_files - valid_node_count(sbi),
+ buf->f_bavail);
buf->f_namelen = F2FS_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
@@ -974,7 +1000,7 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, EXTENT_CACHE);
sbi->sb->s_flags |= MS_LAZYTIME;
set_opt(sbi, FLUSH_MERGE);
- if (f2fs_sb_mounted_hmsmr(sbi->sb)) {
+ if (f2fs_sb_mounted_blkzoned(sbi->sb)) {
set_opt_mode(sbi, F2FS_MOUNT_LFS);
set_opt(sbi, DISCARD);
} else {
@@ -1076,8 +1102,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if flush_merge is not passed in mount option.
*/
if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
- destroy_flush_cmd_control(sbi);
- } else if (!SM_I(sbi)->cmd_control_info) {
+ clear_opt(sbi, FLUSH_MERGE);
+ destroy_flush_cmd_control(sbi, false);
+ } else {
err = create_flush_cmd_control(sbi);
if (err)
goto restore_gc;
@@ -1238,7 +1265,7 @@ static int __f2fs_commit_super(struct buffer_head *bh,
unlock_buffer(bh);
/* it's rare case, we can do fua all the time */
- return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+ return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA);
}
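The same mechanical conversion of the pre-4.10 composite write hints recurs in many hunks below (gfs2, jbd2, jfs, hfsplus, mpage); each pair in this summary appears somewhere in this diff:

/* WRITE_SYNC       -> REQ_SYNC
 * WRITE_FUA        -> REQ_FUA
 * WRITE_FLUSH_FUA  -> REQ_PREFLUSH | REQ_FUA
 * READ_SYNC        -> 0 (plain read; the flag is gone)
 */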
static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi,
@@ -1426,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
unsigned int total, fsmeta;
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ unsigned int ovp_segments, reserved_segments;
total = le32_to_cpu(raw_super->segment_count);
fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -1437,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
if (unlikely(fsmeta >= total))
return 1;
+ ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
+ reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
+
+ if (unlikely(fsmeta < F2FS_MIN_SEGMENTS ||
+ ovp_segments == 0 || reserved_segments == 0)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Wrong layout: check mkfs.f2fs version");
+ return 1;
+ }
+
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
return 1;
@@ -1447,6 +1485,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi)
static void init_sb_info(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = sbi->raw_super;
+ int i;
sbi->log_sectors_per_block =
le32_to_cpu(raw_super->log_sectors_per_block);
@@ -1471,6 +1510,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
+ for (i = 0; i < NR_COUNT_TYPE; i++)
+ atomic_set(&sbi->nr_pages[i], 0);
+
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
mutex_init(&sbi->wio_mutex[NODE]);
@@ -1486,13 +1528,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
static int init_percpu_info(struct f2fs_sb_info *sbi)
{
- int i, err;
-
- for (i = 0; i < NR_COUNT_TYPE; i++) {
- err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
- if (err)
- return err;
- }
+ int err;
err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL);
if (err)
@@ -1502,6 +1538,71 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
GFP_KERNEL);
}
+#ifdef CONFIG_BLK_DEV_ZONED
+static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
+{
+ struct block_device *bdev = FDEV(devi).bdev;
+ sector_t nr_sectors = bdev->bd_part->nr_sects;
+ sector_t sector = 0;
+ struct blk_zone *zones;
+ unsigned int i, nr_zones;
+ unsigned int n = 0;
+ int err = -EIO;
+
+ if (!f2fs_sb_mounted_blkzoned(sbi->sb))
+ return 0;
+
+ if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
+ SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
+ return -EINVAL;
+ sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
+ if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
+ __ilog2_u32(sbi->blocks_per_blkz))
+ return -EINVAL;
+ sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
+ FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
+ sbi->log_blocks_per_blkz;
+ if (nr_sectors & (bdev_zone_size(bdev) - 1))
+ FDEV(devi).nr_blkz++;
+
+ FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
+ if (!FDEV(devi).blkz_type)
+ return -ENOMEM;
+
+#define F2FS_REPORT_NR_ZONES 4096
+
+ zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone),
+ GFP_KERNEL);
+ if (!zones)
+ return -ENOMEM;
+
+ /* Get block zones type */
+ while (zones && sector < nr_sectors) {
+
+ nr_zones = F2FS_REPORT_NR_ZONES;
+ err = blkdev_report_zones(bdev, sector,
+ zones, &nr_zones,
+ GFP_KERNEL);
+ if (err)
+ break;
+ if (!nr_zones) {
+ err = -EIO;
+ break;
+ }
+
+ for (i = 0; i < nr_zones; i++) {
+ FDEV(devi).blkz_type[n] = zones[i].type;
+ sector += zones[i].len;
+ n++;
+ }
+ }
+
+ kfree(zones);
+
+ return err;
+}
+#endif
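The nr_blkz computation rounds up so a partial tail zone still gets a type entry. In plain C, with modulo instead of the power-of-two mask the kernel code uses:

#include <stdio.h>

static unsigned int zones_needed(unsigned long long nr_sectors,
                                 unsigned long long zone_sectors)
{
    unsigned int n = nr_sectors / zone_sectors;

    if (nr_sectors % zone_sectors)  /* partial last zone */
        n++;
    return n;
}

int main(void)
{
    /* e.g. 1,000,000 sectors with 524,288-sector (256 MiB) zones */
    printf("%u\n", zones_needed(1000000ULL, 524288ULL));  /* -> 2 */
    return 0;
}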
+
/*
* Read f2fs raw super block.
* Because we keep two copies of the super block, read both of them
@@ -1594,6 +1695,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
return err;
}
+static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ int i;
+
+ for (i = 0; i < MAX_DEVICES; i++) {
+ if (!RDEV(i).path[0])
+ return 0;
+
+ if (i == 0) {
+ sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) *
+ MAX_DEVICES, GFP_KERNEL);
+ if (!sbi->devs)
+ return -ENOMEM;
+ }
+
+ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
+ FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments);
+ if (i == 0) {
+ FDEV(i).start_blk = 0;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1 +
+ le32_to_cpu(raw_super->segment0_blkaddr);
+ } else {
+ FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
+ FDEV(i).end_blk = FDEV(i).start_blk +
+ (FDEV(i).total_segments <<
+ sbi->log_blocks_per_seg) - 1;
+ }
+
+ FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
+ sbi->sb->s_mode, sbi->sb->s_type);
+ if (IS_ERR(FDEV(i).bdev))
+ return PTR_ERR(FDEV(i).bdev);
+
+ /* to release errored devices */
+ sbi->s_ndevs = i + 1;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
+ !f2fs_sb_mounted_blkzoned(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Zoned block device feature not enabled\n");
+ return -EINVAL;
+ }
+ if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+ if (init_blkz_info(sbi, i)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Failed to initialize F2FS blkzone information");
+ return -EINVAL;
+ }
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk,
+ bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
+ "Host-aware" : "Host-managed");
+ continue;
+ }
+#endif
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "Mount Device [%2d]: %20s, %8u, %8x - %8x",
+ i, FDEV(i).path,
+ FDEV(i).total_segments,
+ FDEV(i).start_blk, FDEV(i).end_blk);
+ }
+ return 0;
+}
+
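The start/end block math above packs devices back to back, with device 0 additionally absorbing the segment0 offset. A standalone illustration with invented sizes:

#include <stdio.h>

int main(void)
{
    unsigned int segs[3] = { 1000, 2000, 1500 };  /* made up */
    unsigned int log_blocks_per_seg = 9;          /* 512 blocks/seg */
    unsigned int segment0_blkaddr = 512;          /* made up */
    unsigned long long start = 0, end;

    for (int i = 0; i < 3; i++) {
        end = start +
              ((unsigned long long)segs[i] << log_blocks_per_seg) - 1 +
              (i == 0 ? segment0_blkaddr : 0);
        printf("dev %d: blocks %llu - %llu\n", i, start, end);
        start = end + 1;
    }
    return 0;
}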
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
@@ -1641,6 +1813,18 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
+ /*
+ * The BLKZONED feature indicates that the drive was formatted with
+ * zone alignment optimization. This is optional for host-aware
+ * devices, but mandatory for host-managed zoned block devices.
+ */
+#ifndef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_mounted_blkzoned(sb)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Zoned block device support is not enabled\n");
+ goto free_sb_buf;
+ }
+#endif
default_options(sbi);
/* parse mount options */
options = kstrdup((const char *)data, GFP_KERNEL);
@@ -1710,6 +1894,13 @@ try_onemore:
goto free_meta_inode;
}
+ /* Initialize device list */
+ err = f2fs_scan_devices(sbi);
+ if (err) {
+ f2fs_msg(sb, KERN_ERR, "Failed to find devices");
+ goto free_devices;
+ }
+
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
percpu_counter_set(&sbi->total_valid_inode_count,
@@ -1893,12 +2084,21 @@ free_node_inode:
mutex_lock(&sbi->umount_mutex);
release_ino_entry(sbi, true);
f2fs_leave_shrinker(sbi);
+ /*
+ * Some dirty meta pages can be produced when recover_orphan_inodes()
+ * fails with EIO. Then, iput(node_inode) can trigger balance_fs_bg()
+ * followed by write_checkpoint() through f2fs_write_node_pages(), which
+ * falls into an infinite loop in sync_meta_pages().
+ */
+ truncate_inode_pages_final(META_MAPPING(sbi));
iput(sbi->node_inode);
mutex_unlock(&sbi->umount_mutex);
free_nm:
destroy_node_manager(sbi);
free_sm:
destroy_segment_manager(sbi);
+free_devices:
+ destroy_device_list(sbi);
kfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
@@ -2044,3 +2244,4 @@ module_exit(exit_f2fs_fs)
MODULE_AUTHOR("Samsung Electronics's Praesto Team");
MODULE_DESCRIPTION("Flash Friendly File System");
MODULE_LICENSE("GPL");
+
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3e1c0280f866..c47ce2f330a1 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
return -EINVAL;
F2FS_I(inode)->i_advise |= *(char *)value;
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -554,7 +554,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
if (!error && S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
exit:
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 05713a5da083..ef600591d96f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb,
* become available for writeback. Otherwise
* we'll just busyloop.
*/
- if (!list_empty(&wb->b_more_io)) {
- trace_writeback_wait(wb, work);
- inode = wb_inode(wb->b_more_io.prev);
- spin_lock(&inode->i_lock);
- spin_unlock(&wb->list_lock);
- /* This function drops i_lock... */
- inode_sleep_on_writeback(inode);
- spin_lock(&wb->list_lock);
- }
+ trace_writeback_wait(wb, work);
+ inode = wb_inode(wb->b_more_io.prev);
+ spin_lock(&inode->i_lock);
+ spin_unlock(&wb->list_lock);
+ /* This function drops i_lock... */
+ inode_sleep_on_writeback(inode);
+ spin_lock(&wb->list_lock);
}
spin_unlock(&wb->list_lock);
blk_finish_plug(&plug);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b3ebe512d64c..096f79997f75 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1739,8 +1739,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
* This should be done on write(), truncate() and chown().
*/
if (!fc->handle_killpriv) {
- int kill;
-
/*
* ia_mode calculation may have used stale i_mode.
* Refresh and recalculate.
@@ -1750,12 +1748,11 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
return ret;
attr->ia_mode = inode->i_mode;
- kill = should_remove_suid(entry);
- if (kill & ATTR_KILL_SUID) {
+ if (inode->i_mode & S_ISUID) {
attr->ia_valid |= ATTR_MODE;
attr->ia_mode &= ~S_ISUID;
}
- if (kill & ATTR_KILL_SGID) {
+ if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
attr->ia_valid |= ATTR_MODE;
attr->ia_mode &= ~S_ISGID;
}
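The fuse hunk open-codes the tests that should_remove_suid() performs instead of consuming its ATTR_KILL_* result. Only the mode-bit predicates shown above, restated in plain C (the kernel helper additionally checks S_ISREG and CAP_FSETID):

#include <sys/stat.h>

static int mode_bits_to_kill(mode_t mode)
{
    int kill = 0;

    if (mode & S_ISUID)                          /* always drop suid */
        kill |= S_ISUID;
    if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
        kill |= S_ISGID;             /* drop sgid only if group-exec */
    return kill;
}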
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 3cdde5f5d399..79113219be5f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -62,6 +62,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/vmalloc.h>
+#include <linux/bio.h>
#include "gfs2.h"
#include "incore.h"
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e58ccef09c91..27c00a16def0 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
struct gfs2_log_header *lh;
unsigned int tail;
u32 hash;
- int op_flags = WRITE_FLUSH_FUA | REQ_META;
+ int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META;
struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
lh = page_address(page);
@@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
gfs2_ordered_wait(sdp);
log_flush_wait(sdp);
- op_flags = WRITE_SYNC | REQ_META | REQ_PRIO;
+ op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
}
sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 49d5a1b61b06..b1f9144b42c7 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio)
* gfs2_log_flush_bio - Submit any pending log bio
* @sdp: The superblock
* @op: REQ_OP
- * @op_flags: rq_flag_bits
+ * @op_flags: req_flag_bits
*
* Submit any pending part-built or full bio to the block device. If
* there is no pending bio, then this is a no-op.
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 373639a59782..49db8ef13fdf 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
{
struct buffer_head *bh, *head;
int nr_underway = 0;
- int write_flags = REQ_META | REQ_PRIO |
- (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
BUG_ON(!PageLocked(page));
BUG_ON(!page_has_buffers(page));
@@ -285,7 +284,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
}
}
- gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
+ gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
@@ -453,7 +452,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (buffer_uptodate(first_bh))
goto out;
if (!buffer_locked(first_bh))
- ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh);
+ ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ff72ac6439c8..a34308df927f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
bio->bi_end_io = end_bio_io_page;
bio->bi_private = page;
- bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META);
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_META);
submit_bio(bio);
wait_on_page_locked(page);
bio_put(bio);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 11854dd84572..67aedf4c2e7c 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
sbi->s_vhdr_buf, NULL, REQ_OP_WRITE,
- WRITE_SYNC);
+ REQ_SYNC);
if (!error)
error = error2;
if (!write_backup)
@@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + sbi->sect_count - 2,
sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE,
- WRITE_SYNC);
+ REQ_SYNC);
if (!error)
error2 = error;
out:
diff --git a/fs/internal.h b/fs/internal.h
index f4da3341b4a3..4fcf51766d4a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap_ops *ops, void *data,
iomap_actor_t actor);
+
+/* direct-io.c: */
+int sb_init_dio_done_wq(struct super_block *sb);
diff --git a/fs/iomap.c b/fs/iomap.c
index a8ee8c33ca78..354a123f170e 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -24,6 +24,7 @@
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include "internal.h"
@@ -467,8 +468,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
offset = page_offset(page);
while (length > 0) {
- ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
- ops, page, iomap_page_mkwrite_actor);
+ ret = iomap_apply(inode, offset, length,
+ IOMAP_WRITE | IOMAP_FAULT, ops, page,
+ iomap_page_mkwrite_actor);
if (unlikely(ret <= 0))
goto out_unlock;
offset += ret;
@@ -583,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
+
+/*
+ * Private flags for iomap_dio, must not overlap with the public ones in
+ * iomap.h:
+ */
+#define IOMAP_DIO_WRITE (1 << 30)
+#define IOMAP_DIO_DIRTY (1 << 31)
+
+struct iomap_dio {
+ struct kiocb *iocb;
+ iomap_dio_end_io_t *end_io;
+ loff_t i_size;
+ loff_t size;
+ atomic_t ref;
+ unsigned flags;
+ int error;
+
+ union {
+ /* used during submission and for synchronous completion: */
+ struct {
+ struct iov_iter *iter;
+ struct task_struct *waiter;
+ struct request_queue *last_queue;
+ blk_qc_t cookie;
+ } submit;
+
+ /* used for aio completion: */
+ struct {
+ struct work_struct work;
+ } aio;
+ };
+};
+
+static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+{
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret;
+
+ if (dio->end_io) {
+ ret = dio->end_io(iocb,
+ dio->error ? dio->error : dio->size,
+ dio->flags);
+ } else {
+ ret = dio->error;
+ }
+
+ if (likely(!ret)) {
+ ret = dio->size;
+ /* check for short read */
+ if (iocb->ki_pos + ret > dio->i_size &&
+ !(dio->flags & IOMAP_DIO_WRITE))
+ ret = dio->i_size - iocb->ki_pos;
+ iocb->ki_pos += ret;
+ }
+
+ inode_dio_end(file_inode(iocb->ki_filp));
+ kfree(dio);
+
+ return ret;
+}
+
+static void iomap_dio_complete_work(struct work_struct *work)
+{
+ struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
+ struct kiocb *iocb = dio->iocb;
+ bool is_write = (dio->flags & IOMAP_DIO_WRITE);
+ ssize_t ret;
+
+ ret = iomap_dio_complete(dio);
+ if (is_write && ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ iocb->ki_complete(iocb, ret, 0);
+}
+
+/*
+ * Set an error in the dio if none is set yet. We have to use cmpxchg
+ * as the submission context and the completion context(s) can race to
+ * update the error.
+ */
+static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
+{
+ cmpxchg(&dio->error, 0, ret);
+}
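A userspace analogue of the first-error-wins update, using C11 atomics in place of the kernel's cmpxchg():

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int dio_error;

static void set_error_once(int err)
{
    int expected = 0;

    atomic_compare_exchange_strong(&dio_error, &expected, err);
}

int main(void)
{
    set_error_once(-5);         /* -EIO wins */
    set_error_once(-22);        /* -EINVAL is ignored */
    printf("%d\n", dio_error);  /* -5 */
    return 0;
}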
+
+static void iomap_dio_bio_end_io(struct bio *bio)
+{
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_error)
+ iomap_dio_set_error(dio, bio->bi_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ if (is_sync_kiocb(dio->iocb)) {
+ struct task_struct *waiter = dio->submit.waiter;
+
+ WRITE_ONCE(dio->submit.waiter, NULL);
+ wake_up_process(waiter);
+ } else if (dio->flags & IOMAP_DIO_WRITE) {
+ struct inode *inode = file_inode(dio->iocb->ki_filp);
+
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+ } else {
+ iomap_dio_complete_work(&dio->aio.work);
+ }
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
+ bio_put(bio);
+ }
+}
+
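Three completion paths are encoded above (my reading of the code, not text from the patch):

/* - synchronous kiocb: the last bio ref clears submit.waiter and
 *   wakes the submitting task, which finishes the dio inline;
 * - async write: completion is punted to s_dio_done_wq so that
 *   generic_write_sync() can run in process context;
 * - async read: completed directly from the bio end_io path. */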
+static blk_qc_t
+iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
+ unsigned len)
+{
+ struct page *page = ZERO_PAGE(0);
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ bio->bi_bdev = iomap->bdev;
+ bio->bi_iter.bi_sector =
+ iomap->blkno + ((pos - iomap->offset) >> 9);
+ bio->bi_private = dio;
+ bio->bi_end_io = iomap_dio_bio_end_io;
+
+ get_page(page);
+ if (bio_add_page(bio, page, len, 0) != len)
+ BUG();
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+
+ atomic_inc(&dio->ref);
+ return submit_bio(bio);
+}
+
+static loff_t
+iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ struct iomap_dio *dio = data;
+ unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+ unsigned fs_block_size = (1 << inode->i_blkbits), pad;
+ unsigned align = iov_iter_alignment(dio->submit.iter);
+ struct iov_iter iter;
+ struct bio *bio;
+ bool need_zeroout = false;
+ int nr_pages, ret;
+
+ if ((pos | length | align) & ((1 << blkbits) - 1))
+ return -EINVAL;
+
+ switch (iomap->type) {
+ case IOMAP_HOLE:
+ if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
+ return -EIO;
+ /*FALLTHRU*/
+ case IOMAP_UNWRITTEN:
+ if (!(dio->flags & IOMAP_DIO_WRITE)) {
+ iov_iter_zero(length, dio->submit.iter);
+ dio->size += length;
+ return length;
+ }
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ break;
+ case IOMAP_MAPPED:
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+ if (iomap->flags & IOMAP_F_NEW)
+ need_zeroout = true;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ }
+
+ /*
+ * Operate on a partial iter trimmed to the extent we were called for.
+ * We'll update the iter in the dio once we're done with this extent.
+ */
+ iter = *dio->submit.iter;
+ iov_iter_truncate(&iter, length);
+
+ nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+ if (nr_pages <= 0)
+ return nr_pages;
+
+ if (need_zeroout) {
+ /* zero out from the start of the block to the write offset */
+ pad = pos & (fs_block_size - 1);
+ if (pad)
+ iomap_dio_zero(dio, iomap, pos - pad, pad);
+ }
+
+ do {
+ if (dio->error)
+ return 0;
+
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio->bi_bdev = iomap->bdev;
+ bio->bi_iter.bi_sector =
+ iomap->blkno + ((pos - iomap->offset) >> 9);
+ bio->bi_private = dio;
+ bio->bi_end_io = iomap_dio_bio_end_io;
+
+ ret = bio_iov_iter_get_pages(bio, &iter);
+ if (unlikely(ret)) {
+ bio_put(bio);
+ return ret;
+ }
+
+ if (dio->flags & IOMAP_DIO_WRITE) {
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ task_io_account_write(bio->bi_iter.bi_size);
+ } else {
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ if (dio->flags & IOMAP_DIO_DIRTY)
+ bio_set_pages_dirty(bio);
+ }
+
+ dio->size += bio->bi_iter.bi_size;
+ pos += bio->bi_iter.bi_size;
+
+ nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+
+ atomic_inc(&dio->ref);
+
+ dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+ dio->submit.cookie = submit_bio(bio);
+ } while (nr_pages);
+
+ if (need_zeroout) {
+ /* zero out from the end of the write to the end of the block */
+ pad = pos & (fs_block_size - 1);
+ if (pad)
+ iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
+ }
+
+ iov_iter_advance(dio->submit.iter, length);
+ return length;
+}
+
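The head/tail zeroing arithmetic in isolation (a sketch; fs_block_size is a power of two, as the masking above assumes):

#include <stdio.h>

/* Head pad: bytes from the containing block's start up to pos.
 * Tail pad: bytes from end_pos up to the next block boundary. */
static unsigned head_pad(unsigned long long pos, unsigned blksz)
{
    return pos & (blksz - 1);
}

static unsigned tail_pad(unsigned long long end_pos, unsigned blksz)
{
    unsigned pad = end_pos & (blksz - 1);

    return pad ? blksz - pad : 0;
}

int main(void)
{
    /* write at pos=5120 of len=1024 into 4096-byte blocks:
     * zero 4096..5119 before, 6144..8191 after */
    printf("%u %u\n", head_pad(5120, 4096), tail_pad(5120 + 1024, 4096));
    return 0;
}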
+ssize_t
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
+ iomap_dio_end_io_t end_io)
+{
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ size_t count = iov_iter_count(iter);
+ loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
+ unsigned int flags = IOMAP_DIRECT;
+ struct blk_plug plug;
+ struct iomap_dio *dio;
+
+ lockdep_assert_held(&inode->i_rwsem);
+
+ if (!count)
+ return 0;
+
+ dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+ if (!dio)
+ return -ENOMEM;
+
+ dio->iocb = iocb;
+ atomic_set(&dio->ref, 1);
+ dio->size = 0;
+ dio->i_size = i_size_read(inode);
+ dio->end_io = end_io;
+ dio->error = 0;
+ dio->flags = 0;
+
+ dio->submit.iter = iter;
+ if (is_sync_kiocb(iocb)) {
+ dio->submit.waiter = current;
+ dio->submit.cookie = BLK_QC_T_NONE;
+ dio->submit.last_queue = NULL;
+ }
+
+ if (iov_iter_rw(iter) == READ) {
+ if (pos >= dio->i_size)
+ goto out_free_dio;
+
+ if (iter->type == ITER_IOVEC)
+ dio->flags |= IOMAP_DIO_DIRTY;
+ } else {
+ dio->flags |= IOMAP_DIO_WRITE;
+ flags |= IOMAP_WRITE;
+ }
+
+ if (mapping->nrpages) {
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+ if (ret)
+ goto out_free_dio;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
+ }
+
+ inode_dio_begin(inode);
+
+ blk_start_plug(&plug);
+ do {
+ ret = iomap_apply(inode, pos, count, flags, ops, dio,
+ iomap_dio_actor);
+ if (ret <= 0) {
+ /* magic error code to fall back to buffered I/O */
+ if (ret == -ENOTBLK)
+ ret = 0;
+ break;
+ }
+ pos += ret;
+ } while ((count = iov_iter_count(iter)) > 0);
+ blk_finish_plug(&plug);
+
+ if (ret < 0)
+ iomap_dio_set_error(dio, ret);
+
+ if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
+ !inode->i_sb->s_dio_done_wq) {
+ ret = sb_init_dio_done_wq(inode->i_sb);
+ if (ret < 0)
+ iomap_dio_set_error(dio, ret);
+ }
+
+ if (!atomic_dec_and_test(&dio->ref)) {
+ if (!is_sync_kiocb(iocb))
+ return -EIOCBQUEUED;
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(dio->submit.waiter))
+ break;
+
+ if (!(iocb->ki_flags & IOCB_HIPRI) ||
+ !dio->submit.last_queue ||
+ !blk_mq_poll(dio->submit.last_queue,
+ dio->submit.cookie))
+ io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /*
+ * Try again to invalidate clean pages which might have been cached by
+ * non-direct readahead, or faulted in by get_user_pages() if the source
+ * of the write was an mmap'ed region of the file we're writing. Either
+ * one is a pretty crazy thing to do, so we don't support it 100%. If
+ * this invalidation fails, tough, the write still worked...
+ */
+ if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ }
+
+ return iomap_dio_complete(dio);
+
+out_free_dio:
+ kfree(dio);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dio_rw);
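For context, a hypothetical caller; everything prefixed my_ is a placeholder and not part of this patch. iomap_dio_rw() asserts i_rwsem is held, and end_io may be NULL since iomap_dio_complete() checks it before calling:

static ssize_t my_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock_shared(inode);	/* satisfy the lockdep assert */
	ret = iomap_dio_rw(iocb, to, &my_iomap_ops, NULL /* no end_io */);
	inode_unlock_shared(inode);
	return ret;
}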
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 44af14b2e916..9bb2fe35799d 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/bio.h>
#include <linux/vmalloc.h>
#include <linux/zlib.h>
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 98b3eb7d8eaf..0ec137310320 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -377,9 +377,9 @@ repeat:
{
int p;
for (p = 0; p < rr->u.ER.len_id; p++)
- printk("%c", rr->u.ER.data[p]);
+ printk(KERN_CONT "%c", rr->u.ER.data[p]);
}
- printk("\n");
+ printk(KERN_CONT "\n");
break;
case SIG('P', 'X'):
inode->i_mode = isonum_733(rr->u.PX.mode);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 684996c8a3a4..4055f51617ef 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count)
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
- write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
+ write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 31f8ca046639..8c514367ba5a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal,
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
- ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh);
+ ret = submit_bh(REQ_OP_WRITE,
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
else
- ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+ ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
*cbh = bh;
return ret;
@@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
journal->j_tail,
- WRITE_SYNC);
+ REQ_SYNC);
mutex_unlock(&journal->j_checkpoint_mutex);
} else {
jbd_debug(3, "superblock not updated\n");
@@ -717,7 +718,7 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+ submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
}
cond_resched();
stats.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 927da4956a89..8ed971eeab44 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
if (ret)
goto out;
@@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal)
/* Lock here to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
/*
- * Update log tail information. We use WRITE_FUA since new
+ * Update log tail information. We use REQ_FUA since new
* transaction will start reusing journal space and so we
* must make sure information about current log tail is on
* disk before that.
@@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal)
jbd2_journal_update_sb_log_tail(journal,
journal->j_tail_sequence,
journal->j_tail,
- WRITE_FUA);
+ REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
return jbd2_journal_start_thread(journal);
@@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
sb->s_errno = cpu_to_be32(journal->j_errno);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, WRITE_FUA);
+ jbd2_write_superblock(journal, REQ_FUA);
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
@@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal)
++journal->j_transaction_sequence;
write_unlock(&journal->j_state_lock);
- jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
+ jbd2_mark_journal_empty(journal,
+ REQ_PREFLUSH | REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
@@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal, WRITE_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal, WRITE_FUA);
+ jbd2_mark_journal_empty(journal, REQ_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 91171dc352cb..cfc38b552118 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal,
set_buffer_jwrite(descriptor);
BUFFER_TRACE(descriptor, "write");
set_buffer_dirty(descriptor);
- write_dirty_buffer(descriptor, WRITE_SYNC);
+ write_dirty_buffer(descriptor, REQ_SYNC);
}
#endif
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 8653cac7e12e..b6fd1ff29ddf 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -121,7 +121,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
jfs_set_inode_flags(inode);
inode_unlock(inode);
- inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_ctime = current_time(inode);
mark_inode_dirty(inode);
setflags_out:
mnt_drop_write_file(filp);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a21ea8b3e5fa..bb1da1feafeb 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC);
+ bio->bi_opf = REQ_OP_READ;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
bio->bi_iter.bi_size = 0;
@@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp)
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
- bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index a1982118f92f..ac9e108ce1ea 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -335,7 +335,7 @@ static int kernfs_xattr_set(const struct xattr_handler *handler,
return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
}
-const struct xattr_handler kernfs_trusted_xattr_handler = {
+static const struct xattr_handler kernfs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = kernfs_xattr_get,
.set = kernfs_xattr_set,
@@ -372,7 +372,7 @@ static int kernfs_security_xattr_set(const struct xattr_handler *handler,
return error;
}
-const struct xattr_handler kernfs_security_xattr_handler = {
+static const struct xattr_handler kernfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = kernfs_xattr_get,
.set = kernfs_security_xattr_set,
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5426189406c1..fb8cac88251a 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -15,6 +15,6 @@ struct lockd_net {
struct list_head nsm_handles;
};
-extern int lockd_net_id;
+extern unsigned int lockd_net_id;
#endif
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index fc4084ef4736..1c13dd80744f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -57,7 +57,7 @@ static struct task_struct *nlmsvc_task;
static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
-int lockd_net_id;
+unsigned int lockd_net_id;
/*
* These can be set at insmod time (useful for NFS as root filesystem),
diff --git a/fs/mbcache.c b/fs/mbcache.c
index c5bd19ffa326..b19be429d655 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -29,7 +29,7 @@ struct mb_cache {
/* log2 of hash table size */
int c_bucket_bits;
/* Maximum entries in cache to avoid degrading hash too much */
- int c_max_entries;
+ unsigned long c_max_entries;
/* Protects c_list, c_entry_count */
spinlock_t c_list_lock;
struct list_head c_list;
@@ -43,7 +43,7 @@ struct mb_cache {
static struct kmem_cache *mb_entry_cache;
static unsigned long mb_cache_shrink(struct mb_cache *cache,
- unsigned int nr_to_scan);
+ unsigned long nr_to_scan);
static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
u32 key)
@@ -155,12 +155,12 @@ out:
}
/*
- * mb_cache_entry_find_first - find the first entry in cache with given key
+ * mb_cache_entry_find_first - find the first reusable entry with the given key
* @cache: cache where we should search
* @key: key to look for
*
- * Search in @cache for entry with key @key. Grabs reference to the first
- * entry found and returns the entry.
+ * Search in @cache for a reusable entry with key @key. Grabs reference to the
+ * first reusable entry found and returns the entry.
*/
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
u32 key)
@@ -170,14 +170,14 @@ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
EXPORT_SYMBOL(mb_cache_entry_find_first);
/*
- * mb_cache_entry_find_next - find next entry in cache with the same
+ * mb_cache_entry_find_next - find next reusable entry with the same key
* @cache: cache where we should search
* @entry: entry to start search from
*
- * Finds next entry in the hash chain which has the same key as @entry.
- * If @entry is unhashed (which can happen when deletion of entry races
- * with the search), finds the first entry in the hash chain. The function
- * drops reference to @entry and returns with a reference to the found entry.
+ * Finds next reusable entry in the hash chain which has the same key as @entry.
+ * If @entry is unhashed (which can happen when deletion of entry races with the
+ * search), finds the first reusable entry in the hash chain. The function drops
+ * reference to @entry and returns with a reference to the found entry.
*/
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
struct mb_cache_entry *entry)
@@ -274,11 +274,11 @@ static unsigned long mb_cache_count(struct shrinker *shrink,
/* Shrink number of entries in cache */
static unsigned long mb_cache_shrink(struct mb_cache *cache,
- unsigned int nr_to_scan)
+ unsigned long nr_to_scan)
{
struct mb_cache_entry *entry;
struct hlist_bl_head *head;
- unsigned int shrunk = 0;
+ unsigned long shrunk = 0;
spin_lock(&cache->c_list_lock);
while (nr_to_scan-- && !list_empty(&cache->c_list)) {
@@ -286,7 +286,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
struct mb_cache_entry, e_list);
if (entry->e_referenced) {
entry->e_referenced = 0;
- list_move_tail(&cache->c_list, &entry->e_list);
+ list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
list_del_init(&entry->e_list);
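The one-line change above is a real bug fix: the arguments to list_move_tail() were swapped, so the LRU head itself was spliced behind the entry instead of the referenced entry being rotated to the back. A minimal re-implementation of the semantics being relied on, for illustration only:

struct list_head { struct list_head *next, *prev; };

/* list_move_tail(entry, head): detach entry, append at tail of head */
static void list_move_tail_demo(struct list_head *entry,
				struct list_head *head)
{
	entry->prev->next = entry->next;   /* list_del(entry) */
	entry->next->prev = entry->prev;
	entry->prev = head->prev;          /* list_add_tail(entry, head) */
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}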
@@ -316,10 +316,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
static unsigned long mb_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- int nr_to_scan = sc->nr_to_scan;
struct mb_cache *cache = container_of(shrink, struct mb_cache,
c_shrink);
- return mb_cache_shrink(cache, nr_to_scan);
+ return mb_cache_shrink(cache, sc->nr_to_scan);
}
/* We shrink 1/X of the cache when we have too many entries in it */
@@ -341,11 +340,8 @@ static void mb_cache_shrink_worker(struct work_struct *work)
struct mb_cache *mb_cache_create(int bucket_bits)
{
struct mb_cache *cache;
- int bucket_count = 1 << bucket_bits;
- int i;
-
- if (!try_module_get(THIS_MODULE))
- return NULL;
+ unsigned long bucket_count = 1UL << bucket_bits;
+ unsigned long i;
cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
if (!cache)
@@ -377,7 +373,6 @@ struct mb_cache *mb_cache_create(int bucket_bits)
return cache;
err_out:
- module_put(THIS_MODULE);
return NULL;
}
EXPORT_SYMBOL(mb_cache_create);
@@ -411,7 +406,6 @@ void mb_cache_destroy(struct mb_cache *cache)
}
kfree(cache->c_hash);
kfree(cache);
- module_put(THIS_MODULE);
}
EXPORT_SYMBOL(mb_cache_destroy);
@@ -420,7 +414,8 @@ static int __init mbcache_init(void)
mb_entry_cache = kmem_cache_create("mbcache",
sizeof(struct mb_cache_entry), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
- BUG_ON(!mb_entry_cache);
+ if (!mb_entry_cache)
+ return -ENOMEM;
return 0;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index d2413af0823a..28af984a3d96 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
- int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+ int op_flags = wbc_to_write_flags(wbc);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
if (mpd->get_block(inode, block_in_file, &map_bh, 1))
goto confused;
if (buffer_new(&map_bh))
- unmap_underlying_metadata(map_bh.b_bdev,
- map_bh.b_blocknr);
+ clean_bdev_bh_alias(&map_bh);
if (buffer_boundary(&map_bh)) {
boundary_block = map_bh.b_blocknr;
boundary_bdev = map_bh.b_bdev;
@@ -705,7 +704,7 @@ mpage_writepages(struct address_space *mapping,
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
if (mpd.bio) {
int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : 0);
+ REQ_SYNC : 0);
mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
}
@@ -726,7 +725,7 @@ int mpage_writepage(struct page *page, get_block_t get_block,
int ret = __mpage_writepage(page, wbc, &mpd);
if (mpd.bio) {
int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : 0);
+ REQ_SYNC : 0);
mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio);
}
return ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e9aa235e9d10..f073a6d2c6a5 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -110,20 +110,52 @@ out:
#if defined(CONFIG_NFS_V4_1)
/*
- * Lookup a layout by filehandle.
+ * Lookup a layout inode by stateid
*
- * Note: gets a refcount on the layout hdr and on its respective inode.
- * Caller must put the layout hdr and the inode.
+ * Note: returns a refcount on the inode and superblock
+ */
+static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
+ const nfs4_stateid *stateid)
+{
+ struct nfs_server *server;
+ struct inode *inode;
+ struct pnfs_layout_hdr *lo;
+
+restart:
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ if (stateid != NULL &&
+ !nfs4_stateid_match_other(stateid, &lo->plh_stateid))
+ continue;
+ inode = igrab(lo->plh_inode);
+ if (!inode)
+ continue;
+ if (!nfs_sb_active(inode->i_sb)) {
+ rcu_read_lock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ spin_lock(&clp->cl_lock);
+ goto restart;
+ }
+ return inode;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Lookup a layout inode by filehandle.
+ *
+ * Note: returns a refcount on the inode and superblock
*
- * TODO: keep track of all layouts (and delegations) in a hash table
- * hashed by filehandle.
*/
-static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
- struct nfs_fh *fh)
+static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
+ const struct nfs_fh *fh)
{
struct nfs_server *server;
struct nfs_inode *nfsi;
- struct inode *ino;
+ struct inode *inode;
struct pnfs_layout_hdr *lo;
restart:
@@ -134,37 +166,38 @@ restart:
continue;
if (nfsi->layout != lo)
continue;
- ino = igrab(lo->plh_inode);
- if (!ino)
- break;
- spin_lock(&ino->i_lock);
- /* Is this layout in the process of being freed? */
- if (nfsi->layout != lo) {
- spin_unlock(&ino->i_lock);
- iput(ino);
+ inode = igrab(lo->plh_inode);
+ if (!inode)
+ continue;
+ if (!nfs_sb_active(inode->i_sb)) {
+ rcu_read_lock();
+ spin_unlock(&clp->cl_lock);
+ iput(inode);
+ spin_lock(&clp->cl_lock);
goto restart;
}
- pnfs_get_layout_hdr(lo);
- spin_unlock(&ino->i_lock);
- return lo;
+ return inode;
}
}
return NULL;
}
-static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
- struct nfs_fh *fh)
+static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
+ const struct nfs_fh *fh,
+ const nfs4_stateid *stateid)
{
- struct pnfs_layout_hdr *lo;
+ struct inode *inode;
spin_lock(&clp->cl_lock);
rcu_read_lock();
- lo = get_layout_by_fh_locked(clp, fh);
+ inode = nfs_layout_find_inode_by_stateid(clp, stateid);
+ if (!inode)
+ inode = nfs_layout_find_inode_by_fh(clp, fh);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- return lo;
+ return inode;
}
/*
@@ -213,18 +246,20 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list);
- lo = get_layout_by_fh(clp, &args->cbl_fh);
- if (!lo) {
- trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
- &args->cbl_stateid, -rv);
+ ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid);
+ if (!ino)
goto out;
- }
- ino = lo->plh_inode;
pnfs_layoutcommit_inode(ino, false);
spin_lock(&ino->i_lock);
+ lo = NFS_I(ino)->layout;
+ if (!lo) {
+ spin_unlock(&ino->i_lock);
+ goto out;
+ }
+ pnfs_get_layout_hdr(lo);
rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
if (rv != NFS_OK)
goto unlock;
@@ -258,10 +293,10 @@ unlock:
/* Free all lsegs that are attached to commit buckets */
nfs_commit_inode(ino, 0);
pnfs_put_layout_hdr(lo);
+out:
trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
&args->cbl_stateid, -rv);
- iput(ino);
-out:
+ nfs_iput_and_deactive(ino);
return rv;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ebecfb8fba06..91a8d610ba0f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -369,9 +369,7 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
* Look up a client by IP address and protocol version
* - creates a new record if one doesn't yet exist
*/
-struct nfs_client *
-nfs_get_client(const struct nfs_client_initdata *cl_init,
- rpc_authflavor_t authflavour)
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
{
struct nfs_client *clp, *new = NULL;
struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
@@ -655,7 +653,7 @@ static int nfs_init_server(struct nfs_server *server,
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
+ clp = nfs_get_client(&cl_init);
if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index dff600ae0d74..d7df5e67b0c1 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -391,10 +391,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
- /* Ensure we revalidate the attributes and page cache! */
- spin_lock(&inode->i_lock);
- nfsi->cache_validity |= NFS_INO_REVAL_FORCED;
- spin_unlock(&inode->i_lock);
trace_nfs4_set_delegation(inode, res->delegation_type);
out:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5f1af4cd1a33..cb22a9f9ae7e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -455,14 +455,17 @@ bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
}
/*
- * This function is called by the lookup code to request the use of
- * readdirplus to accelerate any future lookups in the same
+ * This function is called by the lookup and getattr code to request the
+ * use of readdirplus to accelerate any future lookups in the same
* directory.
*/
-static
void nfs_advise_use_readdirplus(struct inode *dir)
{
- set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
+ struct nfs_inode *nfsi = NFS_I(dir);
+
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+ !list_empty(&nfsi->open_files))
+ set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
}
/*
@@ -475,9 +478,12 @@ void nfs_advise_use_readdirplus(struct inode *dir)
*/
void nfs_force_use_readdirplus(struct inode *dir)
{
- if (!list_empty(&NFS_I(dir)->open_files)) {
- nfs_advise_use_readdirplus(dir);
- nfs_zap_mapping(dir, dir->i_mapping);
+ struct nfs_inode *nfsi = NFS_I(dir);
+
+ if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+ !list_empty(&nfsi->open_files)) {
+ set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+ invalidate_mapping_pages(dir->i_mapping, 0, -1);
}
}
@@ -886,17 +892,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
goto out;
}
-static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
-{
- struct nfs_inode *nfsi = NFS_I(dir);
-
- if (nfs_attribute_cache_expired(dir))
- return true;
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
- return true;
- return false;
-}
-
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -928,7 +923,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
- if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
+ if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -1035,8 +1030,6 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
int rcu_walk)
{
- int ret;
-
if (IS_ROOT(dentry))
return 1;
if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -1044,12 +1037,12 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
- if (rcu_walk)
- ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
- else
- ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
- if (ret < 0)
- return 0;
+ if (nfs_mapping_need_revalidate_inode(dir)) {
+ if (rcu_walk)
+ return 0;
+ if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+ return 0;
+ }
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
return 1;
@@ -1161,7 +1154,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
goto out_bad;
}
- goto out_valid_noent;
+ goto out_valid;
}
if (is_bad_inode(inode)) {
@@ -1184,6 +1177,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
goto out_zap_parent;
}
+ nfs_advise_use_readdirplus(dir);
goto out_valid;
}
@@ -1219,12 +1213,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
nfs_free_fhandle(fhandle);
nfs4_label_free(label);
+ /* set a readdirplus hint that we had a cache miss */
+ nfs_force_use_readdirplus(dir);
+
out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
- /* Success: notify readdir to use READDIRPLUS */
- nfs_advise_use_readdirplus(dir);
- out_valid_noent:
if (flags & LOOKUP_RCU) {
if (parent != ACCESS_ONCE(dentry->d_parent))
return -ECHILD;
@@ -1424,8 +1418,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (IS_ERR(res))
goto out_label;
- /* Success: notify readdir to use READDIRPLUS */
- nfs_advise_use_readdirplus(dir);
+ /* Notify readdir to use READDIRPLUS */
+ nfs_force_use_readdirplus(dir);
no_entry:
res = d_splice_alias(inode, dentry);
@@ -1467,9 +1461,9 @@ static fmode_t flags_to_mode(int flags)
return res;
}
-static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags)
+static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
{
- return alloc_nfs_open_context(dentry, flags_to_mode(open_flags));
+ return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
}
static int do_open(struct inode *inode, struct file *filp)
@@ -1535,8 +1529,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
return -ENAMETOOLONG;
if (open_flags & O_CREAT) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ mode &= ~current_umask();
+
attr.ia_valid |= ATTR_MODE;
- attr.ia_mode = mode & ~current_umask();
+ attr.ia_mode = mode;
}
if (open_flags & O_TRUNC) {
attr.ia_valid |= ATTR_SIZE;
@@ -1554,7 +1553,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
return finish_no_open(file, dentry);
}
- ctx = create_nfs_open_context(dentry, open_flags);
+ ctx = create_nfs_open_context(dentry, open_flags, file);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index bd81bcf3ffcf..be88bcdca692 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -105,7 +105,7 @@ struct nfs_direct_req {
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);
static inline void get_dreq(struct nfs_direct_req *dreq)
@@ -684,7 +684,7 @@ out_failed:
}
if (put_dreq(dreq))
- nfs_direct_write_complete(dreq, dreq->inode);
+ nfs_direct_write_complete(dreq);
}
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
@@ -717,7 +717,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
}
if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
- nfs_direct_write_complete(dreq, data->inode);
+ nfs_direct_write_complete(dreq);
}
static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
@@ -768,7 +768,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
}
}
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
+static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
}
@@ -824,7 +824,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
out_put:
if (put_dreq(dreq))
- nfs_direct_write_complete(dreq, hdr->inode);
+ nfs_direct_write_complete(dreq);
hdr->release(hdr);
}
@@ -953,7 +953,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
if (put_dreq(dreq))
- nfs_direct_write_complete(dreq, dreq->inode);
+ nfs_direct_write_complete(dreq);
return 0;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 9ea85ae23c32..64c11f399b3d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -102,8 +102,11 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_inode *nfsi = NFS_I(inode);
+ const unsigned long force_reval = NFS_INO_REVAL_PAGECACHE|NFS_INO_REVAL_FORCED;
+ unsigned long cache_validity = nfsi->cache_validity;
- if (nfs_have_delegated_attributes(inode))
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) &&
+ (cache_validity & force_reval) != force_reval)
goto out_noreval;
if (filp->f_flags & O_DIRECT)
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 4946ef40ba87..a5589b791439 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -279,8 +279,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
dataserver_retrans, 4,
- s->nfs_client->cl_minorversion,
- s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+ s->nfs_client->cl_minorversion);
out_test_devid:
if (filelayout_test_devid_unavailable(devid))
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 98ace127bf86..9e111d07f667 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -25,9 +25,20 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+#define FF_LAYOUTRETURN_MAXERR 20
+
static struct group_info *ff_zero_group;
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr);
+static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ int dev_limit);
+static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror);
+
static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
@@ -172,7 +183,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
spin_lock(&inode->i_lock);
list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
- if (mirror->mirror_ds != pos->mirror_ds)
+ if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
continue;
if (!ff_mirror_match_fh(mirror, pos))
continue;
@@ -349,19 +360,6 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
}
}
-static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
-{
- struct nfs4_deviceid_node *node;
- int i;
-
- if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
- return;
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- node = &fls->mirror_array[i]->mirror_ds->id_node;
- clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
- }
-}
-
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_layoutget_res *lgr,
@@ -415,8 +413,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
- struct nfs4_deviceid devid;
- struct nfs4_deviceid_node *idnode;
struct auth_cred acred = { .group_info = ff_zero_group };
struct rpc_cred __rcu *cred;
u32 ds_count, fh_count, id;
@@ -441,24 +437,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->ds_count = ds_count;
/* deviceid */
- rc = decode_deviceid(&stream, &devid);
+ rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
if (rc)
goto out_err_free;
- idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
- &devid, lh->plh_lc_cred,
- gfp_flags);
- /*
- * upon success, mirror_ds is allocated by previous
- * getdeviceinfo, or newly by .alloc_deviceid_node
- * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure
- */
- if (idnode)
- fls->mirror_array[i]->mirror_ds =
- FF_LAYOUT_MIRROR_DS(idnode);
- else
- goto out_err_free;
-
/* efficiency */
rc = -EIO;
p = xdr_inline_decode(&stream, 4);
@@ -556,8 +538,6 @@ out_sort_mirrors:
rc = ff_layout_check_layout(lgr);
if (rc)
goto out_err_free;
- ff_layout_mark_devices_valid(fls);
-
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
@@ -702,6 +682,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
spin_lock(&mirror->lock);
report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
if (report)
@@ -718,6 +699,7 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
requested, completed,
ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
}
@@ -731,6 +713,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
spin_lock(&mirror->lock);
report = nfs4_ff_layoutstat_start_io(mirror, &mirror->write_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
if (report)
@@ -750,6 +733,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
requested, completed, ktime_get(), task->tk_start);
+ set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
spin_unlock(&mirror->lock);
}
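Each of the I/O accounting helpers above now raises NFS4_FF_MIRROR_STAT_AVAIL under the mirror lock; ff_layout_mirror_prepare_stats() later consumes the bit with test_and_clear_bit(), so mirrors without fresh I/O are skipped in the report. A minimal C11 analogue of that produce/consume handshake, using atomic_exchange() in place of the kernel bitops:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool stat_avail;   /* NFS4_FF_MIRROR_STAT_AVAIL stand-in */

static void record_io(void)
{
	atomic_store(&stat_avail, true);   /* set_bit() in the hunks above */
}

static bool should_report(void)
{
	/* test_and_clear_bit(): report once per burst of I/O */
	return atomic_exchange(&stat_avail, false);
}

int main(void)
{
	record_io();
	printf("first pass reports:  %d\n", should_report());  /* 1 */
	printf("second pass reports: %d\n", should_report());  /* 0 */
	return 0;
}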
@@ -1293,6 +1277,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->pgio_mirror_idx + 1,
&hdr->pgio_mirror_idx))
goto out_eagain;
+ ff_layout_read_record_layoutstats_done(task, hdr);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
@@ -1961,38 +1946,88 @@ ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
id_node));
}
-static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
+static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ const struct nfs4_flexfile_layoutreturn_args *ff_args)
{
- struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
__be32 *start;
- int count = 0, ret = 0;
start = xdr_reserve_space(xdr, 4);
if (unlikely(!start))
return -E2BIG;
+ *start = cpu_to_be32(ff_args->num_errors);
/* This assumes we always return _ALL_ layouts */
- spin_lock(&hdr->plh_inode->i_lock);
- ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
- spin_unlock(&hdr->plh_inode->i_lock);
+ return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors);
+}
- *start = cpu_to_be32(count);
+static void
+encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
+{
+ __be32 *p;
- return ret;
+ p = xdr_reserve_space(xdr, len);
+ xdr_encode_opaque_fixed(p, buf, len);
+}
+
+static void
+ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, devinfo->offset);
+ p = xdr_encode_hyper(p, devinfo->length);
+ encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+ p = xdr_reserve_space(xdr, 4*8);
+ p = xdr_encode_hyper(p, devinfo->read_count);
+ p = xdr_encode_hyper(p, devinfo->read_bytes);
+ p = xdr_encode_hyper(p, devinfo->write_count);
+ p = xdr_encode_hyper(p, devinfo->write_bytes);
+ encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE);
+}
+
+static void
+ff_layout_encode_ff_iostat(struct xdr_stream *xdr,
+ const nfs4_stateid *stateid,
+ const struct nfs42_layoutstat_devinfo *devinfo)
+{
+ ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo,
+ devinfo->ld_private.data);
}
/* report nothing for now */
-static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
+static void ff_layout_encode_iostats_array(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct nfs4_flexfile_layoutreturn_args *ff_args)
{
__be32 *p;
+ int i;
p = xdr_reserve_space(xdr, 4);
- if (likely(p))
- *p = cpu_to_be32(0);
+ *p = cpu_to_be32(ff_args->num_dev);
+ for (i = 0; i < ff_args->num_dev; i++)
+ ff_layout_encode_ff_iostat(xdr,
+ &args->layout->plh_stateid,
+ &ff_args->devinfo[i]);
+}
+
+static void
+ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo,
+ unsigned int num_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_entries; i++) {
+ if (!devinfo[i].ld_private.ops)
+ continue;
+ if (!devinfo[i].ld_private.ops->free)
+ continue;
+ devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
+ }
}
static struct nfs4_deviceid_node *
@@ -2008,24 +2043,91 @@ ff_layout_alloc_deviceid_node(struct nfs_server *server,
}
static void
-ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
- struct xdr_stream *xdr,
- const struct nfs4_layoutreturn_args *args)
-{
- struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
+ const void *voidargs,
+ const struct nfs4_xdr_opaque_data *ff_opaque)
+{
+ const struct nfs4_layoutreturn_args *args = voidargs;
+ struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data;
+ struct xdr_buf tmp_buf = {
+ .head = {
+ [0] = {
+ .iov_base = page_address(ff_args->pages[0]),
+ },
+ },
+ .buflen = PAGE_SIZE,
+ };
+ struct xdr_stream tmp_xdr;
__be32 *start;
dprintk("%s: Begin\n", __func__);
- start = xdr_reserve_space(xdr, 4);
- BUG_ON(!start);
- ff_layout_encode_ioerr(flo, xdr, args);
- ff_layout_encode_iostats(flo, xdr, args);
+ xdr_init_encode(&tmp_xdr, &tmp_buf, NULL);
+
+ ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
+ ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
+
+ start = xdr_reserve_space(xdr, 4);
+ *start = cpu_to_be32(tmp_buf.len);
+ xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len);
- *start = cpu_to_be32((xdr->p - start - 1) * 4);
dprintk("%s: Return\n", __func__);
}
+static void
+ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args)
+{
+ struct nfs4_flexfile_layoutreturn_args *ff_args;
+
+ if (!args->data)
+ return;
+ ff_args = args->data;
+ args->data = NULL;
+
+ ff_layout_free_ds_ioerr(&ff_args->errors);
+ ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev);
+
+ put_page(ff_args->pages[0]);
+ kfree(ff_args);
+}
+
+const struct nfs4_xdr_opaque_ops layoutreturn_ops = {
+ .encode = ff_layout_encode_layoutreturn,
+ .free = ff_layout_free_layoutreturn,
+};
+
+static int
+ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
+{
+ struct nfs4_flexfile_layoutreturn_args *ff_args;
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
+
+ ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL);
+ if (!ff_args)
+ goto out_nomem;
+ ff_args->pages[0] = alloc_page(GFP_KERNEL);
+ if (!ff_args->pages[0])
+ goto out_nomem_free;
+
+ INIT_LIST_HEAD(&ff_args->errors);
+ ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout,
+ &args->range, &ff_args->errors,
+ FF_LAYOUTRETURN_MAXERR);
+
+ spin_lock(&args->inode->i_lock);
+ ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
+ &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo));
+ spin_unlock(&args->inode->i_lock);
+
+ args->ld_private->ops = &layoutreturn_ops;
+ args->ld_private->data = ff_args;
+ return 0;
+out_nomem_free:
+ kfree(ff_args);
+out_nomem:
+ return -ENOMEM;
+}
+
static int
ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
{
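ff_layout_encode_layoutreturn() above now builds the ioerr and iostats bodies in a scratch page through a temporary xdr_stream, then emits the 4-byte opaque length followed by the page contents, rather than reserving the length slot and backpatching it. A userspace sketch of that encode-then-splice pattern; the payload string is a stand-in for the real XDR body:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t encode_body(char *scratch, size_t cap)
{
	const char payload[] = "ioerr+iostats";  /* stand-in for real XDR */
	size_t len = sizeof(payload) - 1;

	memcpy(scratch, payload, len < cap ? len : cap);
	return len;
}

static size_t encode_opaque(char *out, size_t cap)
{
	char scratch[4096];                       /* the borrowed page */
	size_t len = encode_body(scratch, sizeof(scratch));
	uint32_t be_len = htonl((uint32_t)len);

	if (4 + len > cap)
		return 0;
	memcpy(out, &be_len, 4);                  /* opaque<> length */
	memcpy(out + 4, scratch, len);            /* xdr_write_pages() */
	return 4 + len;
}

int main(void)
{
	char wire[64];
	size_t n = encode_opaque(wire, sizeof(wire));

	printf("encoded %zu bytes (body \"%.*s\")\n", n, (int)(n - 4), wire + 4);
	return 0;
}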
@@ -2146,21 +2248,18 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr,
}
static void
-ff_layout_encode_layoutstats(struct xdr_stream *xdr,
- struct nfs42_layoutstat_args *args,
- struct nfs42_layoutstat_devinfo *devinfo)
+ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
+ const struct nfs42_layoutstat_devinfo *devinfo,
+ struct nfs4_ff_layout_mirror *mirror)
{
- struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
struct nfs4_pnfs_ds_addr *da;
struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
struct nfs_fh *fh = &mirror->fh_versions[0];
- __be32 *p, *start;
+ __be32 *p;
da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
dprintk("%s: DS %s: encoding address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- /* layoutupdate length */
- start = xdr_reserve_space(xdr, 4);
/* netaddr4 */
ff_layout_encode_netaddr(xdr, da);
/* nfs_fh4 */
@@ -2177,42 +2276,71 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
/* bool */
p = xdr_reserve_space(xdr, 4);
*p = cpu_to_be32(false);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
+ const struct nfs4_xdr_opaque_data *opaque)
+{
+ struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque,
+ struct nfs42_layoutstat_devinfo, ld_private);
+ __be32 *start;
+
+ /* layoutupdate length */
+ start = xdr_reserve_space(xdr, 4);
+ ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data);
*start = cpu_to_be32((xdr->p - start - 1) * 4);
}
+static void
+ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
+{
+ struct nfs4_ff_layout_mirror *mirror = opaque->data;
+
+ ff_layout_put_mirror(mirror);
+}
+
+static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
+ .encode = ff_layout_encode_layoutstats,
+ .free = ff_layout_free_layoutstats,
+};
+
static int
-ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
- struct pnfs_layout_hdr *lo,
+ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
+ struct nfs42_layoutstat_devinfo *devinfo,
int dev_limit)
{
struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *dev;
- struct nfs42_layoutstat_devinfo *devinfo;
int i = 0;
list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
if (i >= dev_limit)
break;
- if (!mirror->mirror_ds)
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
+ continue;
+ if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
continue;
/* mirror refcount put in cleanup_layoutstats */
if (!atomic_inc_not_zero(&mirror->ref))
continue;
dev = &mirror->mirror_ds->id_node;
- devinfo = &args->devinfo[i];
memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
devinfo->offset = 0;
devinfo->length = NFS4_MAX_UINT64;
+ spin_lock(&mirror->lock);
devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+ spin_unlock(&mirror->lock);
devinfo->layout_type = LAYOUT_FLEX_FILES;
- devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
- devinfo->layout_private = mirror;
+ devinfo->ld_private.ops = &layoutstat_ops;
+ devinfo->ld_private.data = mirror;
+ devinfo++;
i++;
}
return i;
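layoutstat_ops above moves the per-device encoder and its cleanup behind the generic struct nfs4_xdr_opaque_ops vtable, so the common layoutstats code only ever sees an opaque handle plus callbacks. A self-contained sketch of the same ownership pattern; all names here are invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct opaque_data;

struct opaque_ops {
	void (*encode)(char *buf, size_t len, const struct opaque_data *priv);
	void (*free)(struct opaque_data *priv);
};

struct opaque_data {
	const struct opaque_ops *ops;
	void *data;                     /* driver-private payload */
};

static void mirror_encode(char *buf, size_t len, const struct opaque_data *priv)
{
	snprintf(buf, len, "mirror:%s", (const char *)priv->data);
}

static void mirror_free(struct opaque_data *priv)
{
	free(priv->data);               /* ff_layout_put_mirror() analogue */
	priv->data = NULL;
}

static const struct opaque_ops mirror_ops = { mirror_encode, mirror_free };

int main(void)
{
	struct opaque_data d = { &mirror_ops, strdup("ds1") };
	char buf[32];

	if (d.ops && d.ops->encode)     /* as in encode_layoutstats() */
		d.ops->encode(buf, sizeof(buf), &d);
	printf("%s\n", buf);
	if (d.ops && d.ops->free)       /* as in nfs42_layoutstat_release() */
		d.ops->free(&d);
	return 0;
}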
@@ -2222,47 +2350,27 @@ static int
ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
struct nfs4_flexfile_layout *ff_layout;
- struct nfs4_ff_layout_mirror *mirror;
- int dev_count = 0;
+ const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
- spin_lock(&args->inode->i_lock);
- ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
- list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
- if (atomic_read(&mirror->ref) != 0)
- dev_count ++;
- }
- spin_unlock(&args->inode->i_lock);
/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
- if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
- dprintk("%s: truncating devinfo to limit (%d:%d)\n",
- __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
- dev_count = PNFS_LAYOUTSTATS_MAXDEV;
- }
args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
if (!args->devinfo)
return -ENOMEM;
spin_lock(&args->inode->i_lock);
- args->num_dev = ff_layout_mirror_prepare_stats(args,
- &ff_layout->generic_hdr, dev_count);
+ ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+ args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
+ &args->devinfo[0], dev_count);
spin_unlock(&args->inode->i_lock);
+ if (!args->num_dev) {
+ kfree(args->devinfo);
+ args->devinfo = NULL;
+ return -ENOENT;
+ }
return 0;
}
-static void
-ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
-{
- struct nfs4_ff_layout_mirror *mirror;
- int i;
-
- for (i = 0; i < data->args.num_dev; i++) {
- mirror = data->args.devinfo[i].layout_private;
- data->args.devinfo[i].layout_private = NULL;
- ff_layout_put_mirror(mirror);
- }
-}
-
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
@@ -2284,10 +2392,9 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.read_pagelist = ff_layout_read_pagelist,
.write_pagelist = ff_layout_write_pagelist,
.alloc_deviceid_node = ff_layout_alloc_deviceid_node,
- .encode_layoutreturn = ff_layout_encode_layoutreturn,
+ .prepare_layoutreturn = ff_layout_prepare_layoutreturn,
.sync = pnfs_nfs_generic_sync,
.prepare_layoutstats = ff_layout_prepare_layoutstats,
- .cleanup_layoutstats = ff_layout_cleanup_layoutstats,
};
static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 3ee0c9fcea76..f4f39b0ab09b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -21,6 +21,7 @@
/* LAYOUTSTATS report interval in ms */
#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+#define FF_LAYOUTSTATS_MAXDEV 4
struct nfs4_ff_ds_version {
u32 version;
@@ -73,6 +74,7 @@ struct nfs4_ff_layout_mirror {
struct list_head mirrors;
u32 ds_count;
u32 efficiency;
+ struct nfs4_deviceid devid;
struct nfs4_ff_layout_ds *mirror_ds;
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
@@ -81,12 +83,15 @@ struct nfs4_ff_layout_mirror {
struct rpc_cred __rcu *rw_cred;
atomic_t ref;
spinlock_t lock;
+ unsigned long flags;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
u32 report_interval;
};
+#define NFS4_FF_MIRROR_STAT_AVAIL (0)
+
struct nfs4_ff_layout_segment {
struct pnfs_layout_segment generic_hdr;
u64 stripe_unit;
@@ -103,6 +108,14 @@ struct nfs4_flexfile_layout {
ktime_t last_report_time; /* Layoutstat report times */
};
+struct nfs4_flexfile_layoutreturn_args {
+ struct list_head errors;
+ struct nfs42_layoutstat_devinfo devinfo[FF_LAYOUTSTATS_MAXDEV];
+ unsigned int num_errors;
+ unsigned int num_dev;
+ struct page *pages[1];
+};
+
static inline struct nfs4_flexfile_layout *
FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
{
@@ -180,9 +193,12 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
struct nfs4_ff_layout_mirror *mirror, u64 offset,
u64 length, int status, enum nfs_opnum4 opnum,
gfp_t gfp_flags);
-int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr, int *count,
- const struct pnfs_layout_range *range);
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head);
+void ff_layout_free_ds_ioerr(struct list_head *head);
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum);
struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
@@ -197,7 +213,6 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
struct inode *inode);
struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
u32 ds_idx, struct rpc_cred *mdscred);
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg);
bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f7a3f6b05369..3cc39d1c1206 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -20,9 +20,11 @@
static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
static unsigned int dataserver_retrans;
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+
void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
{
- if (mirror_ds)
+ if (!IS_ERR_OR_NULL(mirror_ds))
nfs4_put_deviceid_node(&mirror_ds->id_node);
}
@@ -182,12 +184,29 @@ static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
}
static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
- struct nfs4_ff_layout_mirror *mirror)
+ struct nfs4_ff_layout_mirror *mirror,
+ bool create)
{
- if (mirror == NULL || mirror->mirror_ds == NULL) {
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
- lseg);
- return false;
+ if (mirror == NULL || IS_ERR(mirror->mirror_ds))
+ goto outerr;
+ if (mirror->mirror_ds == NULL) {
+ if (create) {
+ struct nfs4_deviceid_node *node;
+ struct pnfs_layout_hdr *lh = lseg->pls_layout;
+ struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
+
+ node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
+ &mirror->devid, lh->plh_lc_cred,
+ GFP_KERNEL);
+ if (node)
+ mirror_ds = FF_LAYOUT_MIRROR_DS(node);
+
+ /* check for race with another call to this function */
+ if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
+ mirror_ds != ERR_PTR(-ENODEV))
+ nfs4_put_deviceid_node(node);
+ } else
+ goto outerr;
}
if (mirror->mirror_ds->ds == NULL) {
struct nfs4_deviceid_node *devid;
@@ -196,15 +215,9 @@ static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
return false;
}
return true;
-}
-
-static u64
-end_offset(u64 start, u64 len)
-{
- u64 end;
-
- end = start + len;
- return end >= start ? end : NFS4_MAX_UINT64;
+outerr:
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
+ return false;
}
static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
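The ff_layout_mirror_valid() changes above defer device-node lookup until first I/O and arbitrate concurrent callers with cmpxchg(): whichever thread installs its pointer first wins, and the loser drops its reference and adopts the winner's. A userspace analogue using C11 atomics; the kernel additionally caches ERR_PTR(-ENODEV) on lookup failure, which this sketch omits:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct device_node { int id; };

/* Lazily-filled device pointer, as mirror->mirror_ds above;
 * NULL means "not looked up yet". */
static _Atomic(struct device_node *) mirror_ds;

static struct device_node *lookup_device(int id)
{
	struct device_node *n = malloc(sizeof(*n));

	if (n)
		n->id = id;
	return n;
}

static struct device_node *get_mirror_ds(int devid)
{
	struct device_node *ds = atomic_load(&mirror_ds);
	struct device_node *expected = NULL;

	if (ds)
		return ds;
	ds = lookup_device(devid);
	if (!ds)
		return NULL;
	/* cmpxchg(&mirror->mirror_ds, NULL, mirror_ds): if another
	 * thread won the race, free our copy and use theirs. */
	if (!atomic_compare_exchange_strong(&mirror_ds, &expected, ds)) {
		free(ds);
		ds = expected;
	}
	return ds;
}

int main(void)
{
	printf("device %d\n", get_mirror_ds(7)->id);
	return 0;
}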
@@ -212,8 +225,8 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
{
u64 end;
- end = max_t(u64, end_offset(err->offset, err->length),
- end_offset(offset, length));
+ end = max_t(u64, pnfs_end_offset(err->offset, err->length),
+ pnfs_end_offset(offset, length));
err->offset = min_t(u64, err->offset, offset);
err->length = end - err->offset;
}
@@ -235,9 +248,9 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
if (ret != 0)
return ret;
- if (end_offset(e1->offset, e1->length) < e2->offset)
+ if (pnfs_end_offset(e1->offset, e1->length) < e2->offset)
return -1;
- if (e1->offset > end_offset(e2->offset, e2->length))
+ if (e1->offset > pnfs_end_offset(e2->offset, e2->length))
return 1;
/* If ranges overlap or are contiguous, they are the same */
return 0;
@@ -263,8 +276,9 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
}
/* Entries match, so merge "err" into "dserr" */
extend_ds_error(dserr, err->offset, err->length);
- list_del(&err->list);
+ list_replace(&err->list, &dserr->list);
kfree(err);
+ return;
}
list_add_tail(&dserr->list, head);
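The list_replace() hunk above fixes a merge bug: after folding the new range into an overlapping entry, the old code did list_del() on the merged entry and then fell through to list_add_tail(), re-queuing dserr at the tail. The fix swaps the merged entry for dserr in place and returns. A toy sketch of merge-in-place over a singly-linked range list, with range_end() modelling the overflow clamp in pnfs_end_offset():

#include <stdio.h>
#include <stdlib.h>

struct range { unsigned long long off, len; struct range *next; };

static unsigned long long range_end(unsigned long long off,
				    unsigned long long len)
{
	unsigned long long end = off + len;

	return end >= off ? end : ~0ULL;   /* clamp on overflow */
}

static void add_range(struct range **head,
		      unsigned long long off, unsigned long long len)
{
	for (struct range *r = *head; r; r = r->next) {
		if (range_end(r->off, r->len) < off ||
		    range_end(off, len) < r->off)
			continue;          /* disjoint: keep looking */
		unsigned long long end = range_end(r->off, r->len);

		if (range_end(off, len) > end)
			end = range_end(off, len);
		if (off < r->off)
			r->off = off;
		r->len = end - r->off;
		return;                    /* merged in place, no re-add */
	}
	struct range *n = malloc(sizeof(*n));

	if (!n)
		return;
	n->off = off;
	n->len = len;
	n->next = *head;
	*head = n;
}

int main(void)
{
	struct range *head = NULL;

	add_range(&head, 0, 100);
	add_range(&head, 50, 100);         /* overlaps: merged, not duplicated */
	for (struct range *r = head; r; r = r->next)
		printf("[%llu, +%llu)\n", r->off, r->len);
	return 0;
}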
@@ -331,7 +345,7 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
struct nfs_fh *fh = NULL;
- if (!ff_layout_mirror_valid(lseg, mirror)) {
+ if (!ff_layout_mirror_valid(lseg, mirror, false)) {
pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
__func__, mirror_idx);
goto out;
@@ -371,7 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
- if (!ff_layout_mirror_valid(lseg, mirror)) {
+ if (!ff_layout_mirror_valid(lseg, mirror, true)) {
pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
__func__, ds_idx);
goto out;
@@ -393,8 +407,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
dataserver_retrans,
mirror->mirror_ds->ds_versions[0].version,
- mirror->mirror_ds->ds_versions[0].minor_version,
- RPC_AUTH_UNIX);
+ mirror->mirror_ds->ds_versions[0].minor_version);
/* connect success, check rsize/wsize limit */
if (ds->ds_clp) {
@@ -457,28 +470,26 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
}
}
-static bool is_range_intersecting(u64 offset1, u64 length1,
- u64 offset2, u64 length2)
+void ff_layout_free_ds_ioerr(struct list_head *head)
{
- u64 end1 = end_offset(offset1, length1);
- u64 end2 = end_offset(offset2, length2);
+ struct nfs4_ff_layout_ds_err *err;
- return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
- (end2 == NFS4_MAX_UINT64 || end2 > offset1);
+ while (!list_empty(head)) {
+ err = list_first_entry(head,
+ struct nfs4_ff_layout_ds_err,
+ list);
+ list_del(&err->list);
+ kfree(err);
+ }
}
/* called with inode i_lock held */
-int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
- struct xdr_stream *xdr, int *count,
- const struct pnfs_layout_range *range)
+int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head)
{
- struct nfs4_ff_layout_ds_err *err, *n;
+ struct nfs4_ff_layout_ds_err *err;
__be32 *p;
- list_for_each_entry_safe(err, n, &flo->error_list, list) {
- if (!is_range_intersecting(err->offset, err->length,
- range->offset, range->length))
- continue;
+ list_for_each_entry(err, head, list) {
/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
* + array length + deviceid(NFS4_DEVICEID4_SIZE)
* + status(4) + opnum(4)
@@ -497,17 +508,59 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(err->status);
*p++ = cpu_to_be32(err->opnum);
- *count += 1;
- list_del(&err->list);
- dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
+ dprintk("%s: offset %llu length %llu status %d op %d\n",
__func__, err->offset, err->length, err->status,
- err->opnum, *count);
- kfree(err);
+ err->opnum);
}
return 0;
}
+static
+unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum)
+{
+ struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+ struct inode *inode = lo->plh_inode;
+ struct nfs4_ff_layout_ds_err *err, *n;
+ unsigned int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry_safe(err, n, &flo->error_list, list) {
+ if (!pnfs_is_range_intersecting(err->offset,
+ pnfs_end_offset(err->offset, err->length),
+ range->offset,
+ pnfs_end_offset(range->offset, range->length)))
+ continue;
+ if (!maxnum)
+ break;
+ list_move(&err->list, head);
+ maxnum--;
+ ret++;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ struct list_head *head,
+ unsigned int maxnum)
+{
+ unsigned int ret;
+
+ ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum);
+ /* If we're over the max, discard all remaining entries */
+ if (ret == maxnum) {
+ LIST_HEAD(discard);
+ do_layout_fetch_ds_ioerr(lo, range, &discard, -1);
+ ff_layout_free_ds_ioerr(&discard);
+ }
+ return ret;
+}
+
static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
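ff_layout_fetch_ds_ioerr() above moves at most maxnum matching errors to the caller and, when the cap is hit, runs a second pass that drains and frees whatever remains, so unreported entries cannot pile up. A toy sketch of that fetch-then-discard policy, with a flag array standing in for the kernel's error list:

#include <stdio.h>

#define NERR 8

static int pending[NERR] = { 1, 1, 1, 1, 1, 0, 1, 1 };  /* 1 = queued */

static unsigned int fetch(unsigned int maxnum, int *fetched)
{
	unsigned int n = 0;

	for (int i = 0; i < NERR && maxnum; i++) {
		if (!pending[i])
			continue;
		pending[i] = 0;            /* list_move() to the caller */
		if (fetched)
			fetched[n] = i;
		n++;
		maxnum--;
	}
	return n;
}

int main(void)
{
	int got[4], left = 0;
	unsigned int n = fetch(4, got);

	/* Cap hit: discard the rest, as the second
	 * do_layout_fetch_ds_ioerr() pass does with maxnum = -1. */
	if (n == 4)
		fetch((unsigned int)-1, NULL);
	for (int i = 0; i < NERR; i++)
		left += pending[i];
	printf("reported %u errors, %d left queued\n", n, left);
	return 0;
}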
@@ -516,7 +569,11 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (mirror && mirror->mirror_ds) {
+ if (mirror) {
+ if (!mirror->mirror_ds)
+ return true;
+ if (IS_ERR(mirror->mirror_ds))
+ continue;
devid = &mirror->mirror_ds->id_node;
if (!ff_layout_test_devid_unavailable(devid))
return true;
@@ -534,8 +591,10 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
- if (!mirror || !mirror->mirror_ds)
+ if (!mirror || IS_ERR(mirror->mirror_ds))
return false;
+ if (!mirror->mirror_ds)
+ continue;
devid = &mirror->mirror_ds->id_node;
if (ff_layout_test_devid_unavailable(devid))
return false;
@@ -544,7 +603,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
}
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
if (lseg->pls_range.iomode == IOMODE_READ)
return ff_read_layout_has_available_ds(lseg);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bf4ec5ecc97e..5864146e05e6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -634,15 +634,28 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
-static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
+static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
{
struct dentry *parent;
+ if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
+ return;
parent = dget_parent(dentry);
nfs_force_use_readdirplus(d_inode(parent));
dput(parent);
}
+static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
+ return;
+ parent = dget_parent(dentry);
+ nfs_advise_use_readdirplus(d_inode(parent));
+ dput(parent);
+}
+
static bool nfs_need_revalidate_inode(struct inode *inode)
{
if (NFS_I(inode)->cache_validity &
@@ -683,10 +696,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
if (need_atime || nfs_need_revalidate_inode(inode)) {
struct nfs_server *server = NFS_SERVER(inode);
- if (server->caps & NFS_CAP_READDIRPLUS)
- nfs_request_parent_use_readdirplus(dentry);
+ nfs_readdirplus_parent_cache_miss(dentry);
err = __nfs_revalidate_inode(server, inode);
- }
+ } else
+ nfs_readdirplus_parent_cache_hit(dentry);
if (!err) {
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
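The dir.c and inode.c hunks together split the old unconditional hint in two: a getattr served from cache merely advises READDIRPLUS for the parent directory, while a cache miss (or a lookup revalidation miss) forces it, so the next readdir refreshes the whole directory in one round trip. A compact sketch of the resulting policy; the enum names are invented:

#include <stdbool.h>
#include <stdio.h>

enum rdp_hint { HINT_NONE, HINT_ADVISE, HINT_FORCE };

static enum rdp_hint parent_hint(bool cache_hit, bool server_has_readdirplus)
{
	if (!server_has_readdirplus)       /* NFS_CAP_READDIRPLUS check */
		return HINT_NONE;
	/* hit -> nfs_advise_use_readdirplus(), miss -> nfs_force_... */
	return cache_hit ? HINT_ADVISE : HINT_FORCE;
}

int main(void)
{
	printf("hit  -> %d\n", parent_hint(true, true));   /* HINT_ADVISE */
	printf("miss -> %d\n", parent_hint(false, true));  /* HINT_FORCE */
	return 0;
}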
@@ -702,8 +715,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr);
static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
{
atomic_set(&l_ctx->count, 1);
- l_ctx->lockowner.l_owner = current->files;
- l_ctx->lockowner.l_pid = current->tgid;
+ l_ctx->lockowner = current->files;
INIT_LIST_HEAD(&l_ctx->list);
atomic_set(&l_ctx->io_count, 0);
}
@@ -714,9 +726,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
struct nfs_lock_context *pos = head;
do {
- if (pos->lockowner.l_owner != current->files)
- continue;
- if (pos->lockowner.l_pid != current->tgid)
+ if (pos->lockowner != current->files)
continue;
atomic_inc(&pos->count);
return pos;
@@ -799,7 +809,9 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
}
EXPORT_SYMBOL_GPL(nfs_close_context);
-struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode)
+struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
+ fmode_t f_mode,
+ struct file *filp)
{
struct nfs_open_context *ctx;
struct rpc_cred *cred = rpc_lookup_cred();
@@ -818,6 +830,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f
ctx->mode = f_mode;
ctx->flags = 0;
ctx->error = 0;
+ ctx->flock_owner = (fl_owner_t)filp;
nfs_init_lock_context(&ctx->lock_context);
ctx->lock_context.open_context = ctx;
INIT_LIST_HEAD(&ctx->list);
@@ -942,7 +955,7 @@ int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
@@ -1099,11 +1112,17 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
return 0;
}
-static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
+bool nfs_mapping_need_revalidate_inode(struct inode *inode)
{
- if (nfs_have_delegated_attributes(inode))
- return false;
- return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE)
+ unsigned long cache_validity = NFS_I(inode)->cache_validity;
+
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
+ const unsigned long force_reval =
+ NFS_INO_REVAL_PAGECACHE|NFS_INO_REVAL_FORCED;
+ return (cache_validity & force_reval) == force_reval;
+ }
+
+ return (cache_validity & NFS_INO_REVAL_PAGECACHE)
|| nfs_attribute_timeout(inode)
|| NFS_STALE(inode);
}
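nfs_mapping_need_revalidate_inode() above keeps trusting the page cache while a delegation is held unless both NFS_INO_REVAL_PAGECACHE and NFS_INO_REVAL_FORCED are raised; without a delegation any of the usual triggers suffices. A short sketch of that all-bits-set test, with illustrative flag values rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define REVAL_PAGECACHE (1u << 0)   /* illustrative values */
#define REVAL_FORCED    (1u << 1)

static bool need_revalidate(unsigned int validity, bool delegated,
			    bool timed_out)
{
	const unsigned int force_reval = REVAL_PAGECACHE | REVAL_FORCED;

	if (delegated)                  /* cached attributes authoritative */
		return (validity & force_reval) == force_reval;
	return (validity & REVAL_PAGECACHE) || timed_out;
}

int main(void)
{
	printf("%d\n", need_revalidate(REVAL_PAGECACHE, true, true));   /* 0 */
	printf("%d\n", need_revalidate(REVAL_PAGECACHE | REVAL_FORCED,
				       true, false));                  /* 1 */
	return 0;
}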
@@ -1317,7 +1336,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
invalid |= NFS_INO_INVALID_ATIME;
if (invalid != 0)
- nfs_set_cache_invalid(inode, invalid);
+ nfs_set_cache_invalid(inode, invalid | NFS_INO_REVAL_FORCED);
nfsi->read_cache_jiffies = fattr->time_start;
return 0;
@@ -2015,7 +2034,7 @@ static void nfsiod_stop(void)
destroy_workqueue(wq);
}
-int nfs_net_id;
+unsigned int nfs_net_id;
EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 80bcc0befb07..6b79c2ca9b9a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -154,8 +154,7 @@ extern const struct rpc_program nfs_program;
extern void nfs_clients_init(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
-struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
- rpc_authflavor_t);
+struct nfs_client *nfs_get_client(const struct nfs_client_initdata *);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
void nfs_server_remove_lists(struct nfs_server *);
@@ -194,14 +193,13 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
unsigned int ds_retrans,
- u32 minor_version,
- rpc_authflavor_t au_flavor);
+ u32 minor_version);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo,
- unsigned int ds_retrans, rpc_authflavor_t au_flavor);
+ unsigned int ds_retrans);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
@@ -346,6 +344,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const struct nfs_client_initdata *);
/* dir.c */
+extern void nfs_advise_use_readdirplus(struct inode *dir);
extern void nfs_force_use_readdirplus(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index fbce0d885d4c..5fbd2bde91ba 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -35,6 +35,6 @@ struct nfs_net {
#endif
};
-extern int nfs_net_id;
+extern unsigned int nfs_net_id;
#endif
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index ee753547fb0a..7879f2a0fcfd 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -78,8 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
*/
struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
- int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
- rpc_authflavor_t au_flavor)
+ int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
{
struct rpc_timeout ds_timeout;
struct nfs_client *mds_clp = mds_srv->nfs_client;
@@ -106,7 +105,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
/* Use the MDS nfs_client cl_ipaddr. */
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, au_flavor);
+ clp = nfs_get_client(&cl_init);
return clp;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 608501971fe0..d12ff9385f49 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -397,10 +397,13 @@ static void
nfs42_layoutstat_release(void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
- struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+ struct nfs42_layoutstat_devinfo *devinfo = data->args.devinfo;
+ int i;
- if (nfss->pnfs_curr_ld->cleanup_layoutstats)
- nfss->pnfs_curr_ld->cleanup_layoutstats(data);
+ for (i = 0; i < data->args.num_dev; i++) {
+ if (devinfo[i].ld_private.ops && devinfo[i].ld_private.ops->free)
+ devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
+ }
pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
smp_mb__before_atomic();
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 8b2605882a20..6c7296454bbc 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -181,8 +181,9 @@ static void encode_layoutstats(struct xdr_stream *xdr,
NFS4_DEVICEID4_SIZE);
/* Encode layoutupdate4 */
*p++ = cpu_to_be32(devinfo->layout_type);
- if (devinfo->layoutstats_encode != NULL)
- devinfo->layoutstats_encode(xdr, args, devinfo);
+ if (devinfo->ld_private.ops)
+ devinfo->ld_private.ops->encode(xdr, args,
+ &devinfo->ld_private);
else
encode_uint32(xdr, 0);
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1452177c822d..665165833660 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -457,7 +457,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *,
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t,
- const struct nfs_lockowner *, nfs4_stateid *,
+ const struct nfs_lock_context *, nfs4_stateid *,
struct rpc_cred **);
extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 074ac7131459..5ae9d64ea08b 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -464,6 +464,11 @@ static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0;
}
+static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0;
+}
+
/**
* nfs40_walk_client_list - Find server that recognizes a client ID
*
@@ -521,7 +526,21 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (!nfs4_match_client_owner_id(pos, new))
continue;
-
+ /*
+ * We just sent a new SETCLIENTID, which should have
+ * caused the server to return a new cl_confirm. So if
+ * cl_confirm is the same, then this is a different
+ * server that just returned the same cl_confirm by
+ * coincidence:
+ */
+ if ((new != pos) && nfs4_same_verifier(&pos->cl_confirm,
+ &new->cl_confirm))
+ continue;
+ /*
+ * But if the cl_confirm's are different, then the only
+ * way that a SETCLIENTID_CONFIRM to pos can succeed is
+ * if new and pos point to the same server:
+ */
atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
@@ -534,6 +553,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
break;
case 0:
nfs4_swap_callback_idents(pos, new);
+ pos->cl_confirm = new->cl_confirm;
prev = NULL;
*result = pos;
@@ -881,7 +901,6 @@ static int nfs4_set_client(struct nfs_server *server,
const struct sockaddr *addr,
const size_t addrlen,
const char *ip_addr,
- rpc_authflavor_t authflavour,
int proto, const struct rpc_timeout *timeparms,
u32 minorversion, struct net *net)
{
@@ -907,7 +926,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, authflavour);
+ clp = nfs_get_client(&cl_init);
if (IS_ERR(clp)) {
error = PTR_ERR(clp);
goto error;
@@ -948,7 +967,7 @@ error:
struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
- u32 minor_version, rpc_authflavor_t au_flavor)
+ u32 minor_version)
{
struct rpc_timeout ds_timeout;
struct nfs_client *mds_clp = mds_srv->nfs_client;
@@ -979,7 +998,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
* (section 13.1 RFC 5661).
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, au_flavor);
+ clp = nfs_get_client(&cl_init);
dprintk("<-- %s %p\n", __func__, clp);
return clp;
@@ -1103,7 +1122,6 @@ static int nfs4_init_server(struct nfs_server *server,
(const struct sockaddr *)&data->nfs_server.address,
data->nfs_server.addrlen,
data->client_address,
- data->selected_flavor,
data->nfs_server.protocol,
&timeparms,
data->minorversion,
@@ -1200,7 +1218,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
data->addr,
data->addrlen,
parent_client->cl_ipaddr,
- data->authflavor,
rpc_protocol(parent_server->client),
parent_server->client->cl_timeout,
parent_client->cl_mvops->minor_version,
@@ -1311,7 +1328,6 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
nfs_server_remove_lists(server);
error = nfs4_set_client(server, hostname, sap, salen, buf,
- clp->cl_rpcclient->cl_auth->au_flavor,
clp->cl_proto, clnt->cl_timeout,
clp->cl_minorversion, net);
nfs_put_client(clp);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 89a77950e0b0..0efba77789b9 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 241da19b7da4..d33242c8d95d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -94,7 +94,7 @@ static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fa
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel,
struct nfs4_label *olabel);
#ifdef CONFIG_NFS_V4_1
static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
@@ -226,7 +226,6 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
static const u32 nfs4_open_noattr_bitmap[3] = {
FATTR4_WORD0_TYPE
- | FATTR4_WORD0_CHANGE
| FATTR4_WORD0_FILEID,
};
@@ -817,6 +816,10 @@ static int nfs41_sequence_process(struct rpc_task *task,
case -NFS4ERR_SEQ_FALSE_RETRY:
++slot->seq_nr;
goto retry_nowait;
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_BADSESSION:
+ nfs4_schedule_session_recovery(session, res->sr_status);
+ goto retry_nowait;
default:
/* Just update the slot sequence no. */
slot->seq_done = 1;
@@ -1221,6 +1224,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
atomic_inc(&sp->so_count);
p->o_arg.open_flags = flags;
p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+ p->o_arg.umask = current_umask();
+ p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
p->o_arg.share_access = nfs4_map_atomic_open_share(server,
fmode, flags);
/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
@@ -1228,8 +1233,16 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
if (!(flags & O_EXCL)) {
/* ask server to check for all possible rights as results
* are cached */
- p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
- NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE;
+ switch (p->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ p->o_arg.access = NFS4_ACCESS_READ |
+ NFS4_ACCESS_MODIFY |
+ NFS4_ACCESS_EXTEND |
+ NFS4_ACCESS_EXECUTE;
+ }
}
p->o_arg.clientid = server->nfs_client->cl_clientid;
p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
@@ -1239,7 +1252,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.bitmask = nfs4_bitmask(server, label);
p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
p->o_arg.label = nfs4_label_copy(p->a_label, label);
- p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
switch (p->o_arg.claim) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
@@ -2819,7 +2831,7 @@ static int _nfs4_do_open(struct inode *dir,
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
opendata->o_res.f_attr, sattr,
- state, label, olabel);
+ ctx, label, olabel);
if (status == 0) {
nfs_setattr_update_inode(state->inode, sattr,
opendata->o_res.f_attr);
@@ -2914,7 +2926,7 @@ static int _nfs4_do_setattr(struct inode *inode,
struct nfs_setattrargs *arg,
struct nfs_setattrres *res,
struct rpc_cred *cred,
- struct nfs4_state *state)
+ struct nfs_open_context *ctx)
{
struct nfs_server *server = NFS_SERVER(inode);
struct rpc_message msg = {
@@ -2937,15 +2949,17 @@ static int _nfs4_do_setattr(struct inode *inode,
if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
- } else if (truncate && state != NULL) {
- struct nfs_lockowner lockowner = {
- .l_owner = current->files,
- .l_pid = current->tgid,
- };
- if (!nfs4_valid_open_stateid(state))
+ } else if (truncate && ctx != NULL) {
+ struct nfs_lock_context *l_ctx;
+ if (!nfs4_valid_open_stateid(ctx->state))
return -EBADF;
- if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
- &arg->stateid, &delegation_cred) == -EIO)
+ l_ctx = nfs_get_lock_context(ctx);
+ if (IS_ERR(l_ctx))
+ return PTR_ERR(l_ctx);
+ status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx,
+ &arg->stateid, &delegation_cred);
+ nfs_put_lock_context(l_ctx);
+ if (status == -EIO)
return -EBADF;
} else
nfs4_stateid_copy(&arg->stateid, &zero_stateid);
@@ -2955,7 +2969,7 @@ static int _nfs4_do_setattr(struct inode *inode,
status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
put_rpccred(delegation_cred);
- if (status == 0 && state != NULL)
+ if (status == 0 && ctx != NULL)
renew_lease(server, timestamp);
trace_nfs4_setattr(inode, &arg->stateid, status);
return status;
@@ -2963,10 +2977,11 @@ static int _nfs4_do_setattr(struct inode *inode,
static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state, struct nfs4_label *ilabel,
+ struct nfs_open_context *ctx, struct nfs4_label *ilabel,
struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_state *state = ctx ? ctx->state : NULL;
struct nfs_setattrargs arg = {
.fh = NFS_FH(inode),
.iap = sattr,
@@ -2991,7 +3006,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
arg.bitmask = nfs4_bitmask(server, olabel);
do {
- err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
+ err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -3028,10 +3043,15 @@ struct nfs4_closedata {
struct nfs4_state *state;
struct nfs_closeargs arg;
struct nfs_closeres res;
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
struct nfs_fattr fattr;
unsigned long timestamp;
- bool roc;
- u32 roc_barrier;
};
static void nfs4_free_closedata(void *data)
@@ -3040,8 +3060,9 @@ static void nfs4_free_closedata(void *data)
struct nfs4_state_owner *sp = calldata->state->owner;
struct super_block *sb = calldata->state->inode->i_sb;
- if (calldata->roc)
- pnfs_roc_release(calldata->state->inode);
+ if (calldata->lr.roc)
+ pnfs_roc_release(&calldata->lr.arg, &calldata->lr.res,
+ calldata->res.lr_ret);
nfs4_put_open_state(calldata->state);
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
@@ -3060,15 +3081,38 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (!nfs4_sequence_done(task, &calldata->res.seq_res))
return;
trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (calldata->arg.lr_args && task->tk_status != 0) {
+ switch (calldata->res.lr_ret) {
+ default:
+ calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ break;
+ case 0:
+ calldata->arg.lr_args = NULL;
+ calldata->res.lr_res = NULL;
+ break;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
+ case -NFS4ERR_WRONG_CRED:
+ calldata->arg.lr_args = NULL;
+ calldata->res.lr_res = NULL;
+ calldata->res.lr_ret = 0;
+ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
+
/* hmm. we are done with the inode, and in the process of freeing
* the state_owner. we keep this around to process errors
*/
switch (task->tk_status) {
case 0:
res_stateid = &calldata->res.stateid;
- if (calldata->roc)
- pnfs_roc_set_barrier(state->inode,
- calldata->roc_barrier);
renew_lease(server, calldata->timestamp);
break;
case -NFS4ERR_ADMIN_REVOKED:
@@ -3144,15 +3188,20 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_no_action;
}
- if (nfs4_wait_on_layoutreturn(inode, task)) {
+ if (!calldata->lr.roc && nfs4_wait_on_layoutreturn(inode, task)) {
nfs_release_seqid(calldata->arg.seqid);
goto out_wait;
}
- if (calldata->arg.fmode == 0)
+ if (calldata->arg.fmode == 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
- if (calldata->roc)
- pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
+ /* Close-to-open cache consistency revalidation */
+ if (!nfs4_have_delegation(inode, FMODE_READ))
+ calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
+ else
+ calldata->arg.bitmask = NULL;
+ }
calldata->arg.share_access =
nfs4_map_atomic_open_share(NFS_SERVER(inode),
@@ -3179,13 +3228,6 @@ static const struct rpc_call_ops nfs4_close_ops = {
.rpc_release = nfs4_free_closedata,
};
-static bool nfs4_roc(struct inode *inode)
-{
- if (!nfs_have_layout(inode))
- return false;
- return pnfs_roc(inode);
-}
-
/*
* It is possible for data to be read/written from a mem-mapped file
* after the sys_close call (which hits the vfs layer as a flush).
@@ -3233,11 +3275,17 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
if (IS_ERR(calldata->arg.seqid))
goto out_free_calldata;
calldata->arg.fmode = 0;
- calldata->arg.bitmask = server->cache_consistency_bitmask;
+ calldata->lr.arg.ld_private = &calldata->lr.ld_private;
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
- calldata->roc = nfs4_roc(state->inode);
+ calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ calldata->lr.roc = pnfs_roc(state->inode,
+ &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred);
+ if (calldata->lr.roc) {
+ calldata->arg.lr_args = &calldata->lr.arg;
+ calldata->res.lr_res = &calldata->lr.res;
+ }
nfs_sb_active(calldata->inode->i_sb);
msg.rpc_argp = &calldata->arg;
@@ -3290,7 +3338,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
-#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
@@ -3687,7 +3735,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
{
struct inode *inode = d_inode(dentry);
struct rpc_cred *cred = NULL;
- struct nfs4_state *state = NULL;
+ struct nfs_open_context *ctx = NULL;
struct nfs4_label *label = NULL;
int status;
@@ -3708,20 +3756,17 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
/* Search for an existing open(O_WRITE) file */
if (sattr->ia_valid & ATTR_FILE) {
- struct nfs_open_context *ctx;
ctx = nfs_file_open_context(sattr->ia_file);
- if (ctx) {
+ if (ctx)
cred = ctx->cred;
- state = ctx->state;
- }
}
label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
if (IS_ERR(label))
return PTR_ERR(label);
- status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label);
if (status == 0) {
nfs_setattr_update_inode(inode, sattr, fattr);
nfs_setsecurity(inode, fattr, label);
@@ -3966,18 +4011,20 @@ static int
nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
- ctx = alloc_nfs_open_context(dentry, FMODE_READ);
+ ctx = alloc_nfs_open_context(dentry, FMODE_READ, NULL);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
- sattr->ia_mode &= ~current_umask();
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
if (IS_ERR(state)) {
status = PTR_ERR(state);
@@ -4185,6 +4232,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
data->arg.attrs = sattr;
data->arg.ftype = ftype;
data->arg.bitmask = nfs4_bitmask(server, data->label);
+ data->arg.umask = current_umask();
data->res.server = server;
data->res.fh = &data->fh;
data->res.fattr = &data->fattr;
@@ -4282,13 +4330,15 @@ out:
static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_label l, *label = NULL;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
- sattr->ia_mode &= ~current_umask();
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
do {
err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
trace_nfs4_mkdir(dir, &dentry->d_name, err);
@@ -4391,13 +4441,15 @@ out:
static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
+ struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = { };
struct nfs4_label l, *label = NULL;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
- sattr->ia_mode &= ~current_umask();
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ sattr->ia_mode &= ~current_umask();
do {
err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev);
trace_nfs4_mknod(dir, &dentry->d_name, err);
@@ -4541,11 +4593,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid,
const struct nfs_lock_context *l_ctx,
fmode_t fmode)
{
- const struct nfs_lockowner *lockowner = NULL;
-
- if (l_ctx != NULL)
- lockowner = &l_ctx->lockowner;
- return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL);
+ return nfs4_select_rw_stateid(ctx->state, fmode, l_ctx, stateid, NULL);
}
EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid);
@@ -5564,11 +5612,16 @@ struct nfs4_delegreturndata {
struct nfs_fh fh;
nfs4_stateid stateid;
unsigned long timestamp;
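+	/* State for an optional LAYOUTRETURN issued in the same compound */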
+ struct {
+ struct nfs4_layoutreturn_args arg;
+ struct nfs4_layoutreturn_res res;
+ struct nfs4_xdr_opaque_data ld_private;
+ u32 roc_barrier;
+ bool roc;
+ } lr;
struct nfs_fattr fattr;
int rpc_status;
struct inode *inode;
- bool roc;
- u32 roc_barrier;
};
static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -5579,6 +5632,32 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
return;
trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
+
+ /* Handle Layoutreturn errors */
+ if (data->args.lr_args && task->tk_status != 0) {
+	switch (data->res.lr_ret) {
+ default:
+ data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ break;
+ case 0:
+ data->args.lr_args = NULL;
+ data->res.lr_res = NULL;
+ break;
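+		/* Stateid/cred errors a retry won't cure: drop the
+		 * layoutreturn and resend the bare delegreturn */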
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
+ case -NFS4ERR_WRONG_CRED:
+ data->args.lr_args = NULL;
+ data->res.lr_res = NULL;
+ data->res.lr_ret = 0;
+ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
+
switch (task->tk_status) {
case 0:
renew_lease(data->res.server, data->timestamp);
@@ -5602,8 +5681,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
}
}
data->rpc_status = task->tk_status;
- if (data->roc && data->rpc_status == 0)
- pnfs_roc_set_barrier(data->inode, data->roc_barrier);
}
static void nfs4_delegreturn_release(void *calldata)
@@ -5612,8 +5689,9 @@ static void nfs4_delegreturn_release(void *calldata)
struct inode *inode = data->inode;
if (inode) {
- if (data->roc)
- pnfs_roc_release(inode);
+ if (data->lr.roc)
+ pnfs_roc_release(&data->lr.arg, &data->lr.res,
+ data->res.lr_ret);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
@@ -5625,12 +5703,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
- if (nfs4_wait_on_layoutreturn(d_data->inode, task))
+ if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task))
return;
- if (d_data->roc)
- pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
-
nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
&d_data->res.seq_res,
@@ -5676,12 +5751,22 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
nfs4_stateid_copy(&data->stateid, stateid);
data->res.fattr = &data->fattr;
data->res.server = server;
+ data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT;
+ data->lr.arg.ld_private = &data->lr.ld_private;
nfs_fattr_init(data->res.fattr);
data->timestamp = jiffies;
data->rpc_status = 0;
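+	/* Check whether a return-on-close layoutreturn can ride along with
+	 * this delegreturn */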
+ data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, cred);
data->inode = nfs_igrab_and_active(inode);
- if (data->inode)
- data->roc = nfs4_roc(inode);
+ if (data->inode) {
+ if (data->lr.roc) {
+ data->args.lr_args = &data->lr.arg;
+ data->res.lr_res = &data->lr.res;
+ }
+ } else if (data->lr.roc) {
+ pnfs_roc_release(&data->lr.arg, &data->lr.res, 0);
+ data->lr.roc = false;
+ }
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
@@ -8559,21 +8644,13 @@ static void nfs4_layoutreturn_release(void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
struct pnfs_layout_hdr *lo = lrp->args.layout;
- LIST_HEAD(freeme);
dprintk("--> %s\n", __func__);
- spin_lock(&lo->plh_inode->i_lock);
- if (lrp->res.lrs_present) {
- pnfs_mark_matching_lsegs_invalid(lo, &freeme,
- &lrp->args.range,
- be32_to_cpu(lrp->args.stateid.seqid));
- pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- } else
- pnfs_mark_layout_stateid_invalid(lo, &freeme);
- pnfs_clear_layoutreturn_waitbit(lo);
- spin_unlock(&lo->plh_inode->i_lock);
+ pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range,
+ lrp->res.lrs_present ? &lrp->res.stateid : NULL);
nfs4_sequence_free_slot(&lrp->res.seq_res);
- pnfs_free_lseg_list(&freeme);
+ if (lrp->ld_private.ops && lrp->ld_private.ops->free)
+ lrp->ld_private.ops->free(&lrp->ld_private);
pnfs_put_layout_hdr(lrp->args.layout);
nfs_iput_and_deactive(lrp->inode);
kfree(calldata);
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index a61350f75c74..769b85655c4b 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -169,7 +169,7 @@ bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
{
if (slotid <= tbl->max_slotid)
- return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ return nfs4_find_or_create_slot(tbl, slotid, 0, GFP_NOWAIT);
return ERR_PTR(-E2BIG);
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0959c9661662..95baf7d340f0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -800,11 +800,13 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
* that is compatible with current->files
*/
static struct nfs4_lock_state *
-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+__nfs4_find_lock_state(struct nfs4_state *state,
+ fl_owner_t fl_owner, fl_owner_t fl_owner2)
{
struct nfs4_lock_state *pos;
list_for_each_entry(pos, &state->lock_states, ls_locks) {
- if (pos->ls_owner != fl_owner)
+ if (pos->ls_owner != fl_owner &&
+ pos->ls_owner != fl_owner2)
continue;
atomic_inc(&pos->ls_count);
return pos;
@@ -857,7 +859,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
for(;;) {
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, owner);
+	lsp = __nfs4_find_lock_state(state, owner, NULL);
if (lsp != NULL)
break;
if (new != NULL) {
@@ -939,22 +941,23 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
struct nfs4_state *state,
- const struct nfs_lockowner *lockowner)
+ const struct nfs_lock_context *l_ctx)
{
struct nfs4_lock_state *lsp;
- fl_owner_t fl_owner;
+ fl_owner_t fl_owner, fl_flock_owner;
int ret = -ENOENT;
-
- if (lockowner == NULL)
+ if (l_ctx == NULL)
goto out;
if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
goto out;
- fl_owner = lockowner->l_owner;
+ fl_owner = l_ctx->lockowner;
+ fl_flock_owner = l_ctx->open_context->flock_owner;
+
spin_lock(&state->state_lock);
- lsp = __nfs4_find_lock_state(state, fl_owner);
+ lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner);
if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
ret = -EIO;
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
@@ -986,7 +989,7 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
* requests.
*/
int nfs4_select_rw_stateid(struct nfs4_state *state,
- fmode_t fmode, const struct nfs_lockowner *lockowner,
+ fmode_t fmode, const struct nfs_lock_context *l_ctx,
nfs4_stateid *dst, struct rpc_cred **cred)
{
int ret;
@@ -995,7 +998,7 @@ int nfs4_select_rw_stateid(struct nfs4_state *state,
return -EIO;
if (cred != NULL)
*cred = NULL;
- ret = nfs4_copy_lock_stateid(dst, state, lockowner);
+ ret = nfs4_copy_lock_stateid(dst, state, l_ctx);
if (ret == -EIO)
/* A lost lock - don't even consider delegations */
goto out;
@@ -2190,7 +2193,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
}
- nfs4_schedule_lease_recovery(clp);
+ nfs4_schedule_state_manager(clp);
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index fc89e5ed07ee..1af6268a7d8c 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,6 +52,7 @@
#include <linux/nfs.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
+#include <linux/fs_struct.h>
#include "nfs4_fs.h"
#include "internal.h"
@@ -415,6 +416,8 @@ static int nfs4_stat_to_errno(int);
#else /* CONFIG_NFS_V4_1 */
#define encode_sequence_maxsz 0
#define decode_sequence_maxsz 0
+#define encode_layoutreturn_maxsz 0
+#define decode_layoutreturn_maxsz 0
#endif /* CONFIG_NFS_V4_1 */
#define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
@@ -499,22 +502,22 @@ static int nfs4_stat_to_errno(int);
(compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
- encode_open_downgrade_maxsz + \
- encode_getattr_maxsz)
+ encode_open_downgrade_maxsz)
#define NFS4_dec_open_downgrade_sz \
(compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
- decode_open_downgrade_maxsz + \
- decode_getattr_maxsz)
+ decode_open_downgrade_maxsz)
#define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
encode_close_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
decode_close_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \
@@ -708,10 +711,13 @@ static int nfs4_stat_to_errno(int);
#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
+ encode_layoutreturn_maxsz + \
encode_delegreturn_maxsz + \
encode_getattr_maxsz)
#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_layoutreturn_maxsz + \
decode_delegreturn_maxsz + \
decode_getattr_maxsz)
#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \
@@ -1003,7 +1009,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
const struct nfs_server *server,
- bool excl_check)
+ bool excl_check, const umode_t *umask)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -1017,18 +1023,21 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
/*
* We reserve enough space to write the entire attribute buffer at once.
- * In the worst-case, this would be
- * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
- * = 40 bytes, plus any contribution from variable-length fields
- * such as owner/group.
*/
if (iap->ia_valid & ATTR_SIZE) {
bmval[0] |= FATTR4_WORD0_SIZE;
len += 8;
}
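+	/* Only send a umask if the server advertised support for the attribute */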
+ if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
+ umask = NULL;
if (iap->ia_valid & ATTR_MODE) {
- bmval[1] |= FATTR4_WORD1_MODE;
- len += 4;
+ if (umask) {
+ bmval[2] |= FATTR4_WORD2_MODE_UMASK;
+ len += 8;
+ } else {
+ bmval[1] |= FATTR4_WORD1_MODE;
+ len += 4;
+ }
}
if (iap->ia_valid & ATTR_UID) {
owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
@@ -1129,6 +1138,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
*p++ = cpu_to_be32(label->len);
p = xdr_encode_opaque_fixed(p, label->label, label->len);
}
+ if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
+ *p++ = cpu_to_be32(*umask);
+ }
/* out: */
}
@@ -1183,7 +1196,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->label, create->server, false);
+ encode_attrs(xdr, create->attrs, create->label, create->server, false,
+ &create->umask);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1403,11 +1417,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
+ &arg->umask);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false,
+ &arg->umask);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1416,7 +1432,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
case NFS4_CREATE_EXCLUSIVE4_1:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true,
+ &arg->umask);
}
}
@@ -1672,7 +1689,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, arg->label, server, false);
+ encode_attrs(xdr, arg->iap, arg->label, server, false, NULL);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2015,6 +2032,7 @@ encode_layoutreturn(struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args,
struct compound_hdr *hdr)
{
+ const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld;
__be32 *p;
encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr);
@@ -2029,10 +2047,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
spin_lock(&args->inode->i_lock);
encode_nfs4_stateid(xdr, &args->stateid);
spin_unlock(&args->inode->i_lock);
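+	/* Prefer a per-call private encoder, then the layout driver hook,
+	 * else encode an empty body */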
- if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
- NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
- NFS_I(args->inode)->layout, xdr, args);
- } else
+ if (args->ld_private->ops && args->ld_private->ops->encode)
+ args->ld_private->ops->encode(xdr, args, args->ld_private);
+ else if (lr_ops->encode_layoutreturn)
+ lr_ops->encode_layoutreturn(xdr, args);
+ else
encode_uint32(xdr, 0);
}
@@ -2062,6 +2081,13 @@ static void encode_free_stateid(struct xdr_stream *xdr,
encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr);
encode_nfs4_stateid(xdr, &args->stateid);
}
+#else
+static inline void
+encode_layoutreturn(struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args,
+ struct compound_hdr *hdr)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -2249,8 +2275,11 @@ static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
encode_close(xdr, args, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
+ if (args->bitmask != NULL)
+ encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}
@@ -2328,7 +2357,6 @@ static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req,
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fh, &hdr);
encode_open_downgrade(xdr, args, &hdr);
- encode_getfattr(xdr, args->bitmask, &hdr);
encode_nops(&hdr);
}
@@ -2671,6 +2699,8 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fhandle, &hdr);
+ if (args->lr_args)
+ encode_layoutreturn(xdr, args->lr_args, &hdr);
encode_getfattr(xdr, args->bitmask, &hdr);
encode_delegreturn(xdr, args->stateid, &hdr);
encode_nops(&hdr);
@@ -6089,6 +6119,13 @@ static int decode_free_stateid(struct xdr_stream *xdr,
res->status = decode_op_hdr(xdr, OP_FREE_STATEID);
return res->status;
}
+#else
+static inline
+int decode_layoutreturn(struct xdr_stream *xdr,
+ struct nfs4_layoutreturn_res *res)
+{
+ return 0;
+}
#endif /* CONFIG_NFS_V4_1 */
/*
@@ -6115,9 +6152,6 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
if (status)
goto out;
status = decode_open_downgrade(xdr, res);
- if (status != 0)
- goto out;
- decode_getfattr(xdr, res->fattr, res->server);
out:
return status;
}
@@ -6444,6 +6478,12 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
status = decode_putfh(xdr);
if (status)
goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
status = decode_close(xdr, res);
if (status != 0)
goto out;
@@ -6920,6 +6960,12 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
status = decode_putfh(xdr);
if (status != 0)
goto out;
+ if (res->lr_res) {
+ status = decode_layoutreturn(xdr, res->lr_res);
+ res->lr_ret = status;
+ if (status)
+ goto out;
+ }
status = decode_getfattr(xdr, res->fattr, res->server);
if (status != 0)
goto out;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 919efd4a1a23..2a4cdce939a0 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -504,10 +504,10 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p)
}
void
-objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
- struct xdr_stream *xdr,
+objlayout_encode_layoutreturn(struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args)
{
+ struct pnfs_layout_hdr *pnfslay = args->layout;
struct objlayout *objlay = OBJLAYOUT(pnfslay);
struct objlayout_io_res *oir, *tmp;
__be32 *start;
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 2641dbad345c..fc94a5872ed4 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -175,7 +175,6 @@ extern void objlayout_encode_layoutcommit(
const struct nfs4_layoutcommit_args *);
extern void objlayout_encode_layoutreturn(
- struct pnfs_layout_hdr *,
struct xdr_stream *,
const struct nfs4_layoutreturn_args *);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 965db474f4b0..6e629b856a00 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -867,8 +867,7 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
const struct nfs_lock_context *l2)
{
- return l1->lockowner.l_owner == l2->lockowner.l_owner
- && l1->lockowner.l_pid == l2->lockowner.l_pid;
+ return l1->lockowner == l2->lockowner;
}
/**
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 259ef85f435a..896df7bdf85f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -54,6 +54,12 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
static LIST_HEAD(pnfs_modules_tbl);
static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
+static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+ struct list_head *free_me,
+ const struct pnfs_layout_range *range,
+ u32 seq);
+static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
+ struct list_head *tmp_list);
/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
@@ -299,6 +305,49 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+static void
+pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
+ u32 seq)
+{
+ if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ if (seq != 0) {
+ WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
+ lo->plh_return_seq = seq;
+ }
+}
+
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+ lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
+static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+{
+ clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+ rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
+}
+
+static void
+pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
+ clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
+ pnfs_lseg_dec_and_remove_zero(lseg, free_me);
+}
+
/*
* Mark a pnfs_layout_hdr and all associated layout segments as invalid
*
@@ -315,9 +364,17 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
+ struct pnfs_layout_segment *lseg, *next;
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
+ pnfs_clear_layoutreturn_info(lo);
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ pnfs_clear_lseg_state(lseg, lseg_list);
+ pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
+ !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
+ pnfs_clear_layoutreturn_waitbit(lo);
+ return !list_empty(&lo->plh_segs);
}
static int
@@ -396,27 +453,42 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
- struct inode *ino = lseg->pls_layout->plh_inode;
-
- NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
+ if (lseg != NULL) {
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
+ }
}
static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg)
{
- struct inode *inode = lo->plh_inode;
-
WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
list_del_init(&lseg->pls_list);
/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
atomic_dec(&lo->plh_refcount);
- if (list_empty(&lo->plh_segs)) {
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ return;
+ if (list_empty(&lo->plh_segs) &&
+ !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
if (atomic_read(&lo->plh_outstanding) == 0)
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
- rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+static bool
+pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg)
+{
+ if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
+ pnfs_layout_is_valid(lo)) {
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
+ return true;
+ }
+ return false;
}
void
@@ -442,6 +514,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
+ if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
+ lseg = NULL;
spin_unlock(&inode->i_lock);
pnfs_free_lseg(lseg);
pnfs_put_layout_hdr(lo);
@@ -482,22 +556,15 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
struct pnfs_layout_hdr *lo = lseg->pls_layout;
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
return;
- pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
- pnfs_free_lseg_async(lseg);
+ if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) {
+ pnfs_get_layout_hdr(lo);
+ pnfs_free_lseg_async(lseg);
+ }
}
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
-static u64
-end_offset(u64 start, u64 len)
-{
- u64 end;
-
- end = start + len;
- return end >= start ? end : NFS4_MAX_UINT64;
-}
-
/*
* is l2 fully contained in l1?
* start1 end1
@@ -510,33 +577,13 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
const struct pnfs_layout_range *l2)
{
u64 start1 = l1->offset;
- u64 end1 = end_offset(start1, l1->length);
+ u64 end1 = pnfs_end_offset(start1, l1->length);
u64 start2 = l2->offset;
- u64 end2 = end_offset(start2, l2->length);
+ u64 end2 = pnfs_end_offset(start2, l2->length);
return (start1 <= start2) && (end1 >= end2);
}
-/*
- * is l1 and l2 intersecting?
- * start1 end1
- * [----------------------------------)
- * start2 end2
- * [----------------)
- */
-static bool
-pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
- const struct pnfs_layout_range *l2)
-{
- u64 start1 = l1->offset;
- u64 end1 = end_offset(start1, l1->length);
- u64 start2 = l2->offset;
- u64 end2 = end_offset(start2, l2->length);
-
- return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
- (end2 == NFS4_MAX_UINT64 || end2 > start1);
-}
-
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list)
{
@@ -637,6 +684,20 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return remaining;
}
+static void
+pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
+ struct list_head *free_me,
+ const struct pnfs_layout_range *range,
+ u32 seq)
+{
+ struct pnfs_layout_segment *lseg, *next;
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
+ if (pnfs_match_lseg_recall(lseg, range, seq))
+ list_move_tail(&lseg->pls_list, free_me);
+ }
+}
+
/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
@@ -701,6 +762,8 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
struct inode *inode;
list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
+ if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
+ continue;
inode = igrab(lo->plh_inode);
if (inode == NULL)
continue;
@@ -816,14 +879,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
-static void
-pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
-{
- lo->plh_return_iomode = 0;
- lo->plh_return_seq = 0;
- clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
-}
-
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -941,12 +996,31 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
}
}
-void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
{
- clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
- smp_mb__after_atomic();
- wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
- rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
+ struct inode *inode = lo->plh_inode;
+ LIST_HEAD(freeme);
+
+ spin_lock(&inode->i_lock);
+ if (!pnfs_layout_is_valid(lo) || !arg_stateid ||
+ !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
+ goto out_unlock;
+ if (stateid) {
+ u32 seq = be32_to_cpu(arg_stateid->seqid);
+
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
+ pnfs_free_returned_lsegs(lo, &freeme, range, seq);
+ pnfs_set_layout_stateid(lo, stateid, true);
+ } else
+ pnfs_mark_layout_stateid_invalid(lo, &freeme);
+out_unlock:
+ pnfs_clear_layoutreturn_waitbit(lo);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
}
static bool
@@ -957,8 +1031,9 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
/* Serialise LAYOUTGET/LAYOUTRETURN */
if (atomic_read(&lo->plh_outstanding) != 0)
return false;
- if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
return false;
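+	/* RETURN_LOCK serialises the senders; RETURN itself marks a
+	 * layoutreturn in flight */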
+ set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
pnfs_get_layout_hdr(lo);
if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
if (stateid != NULL) {
@@ -978,11 +1053,29 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
return true;
}
+static void
+pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
+ struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ enum pnfs_iomode iomode)
+{
+ struct inode *inode = lo->plh_inode;
+
+ args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
+ args->inode = inode;
+ args->range.iomode = iomode;
+ args->range.offset = 0;
+ args->range.length = NFS4_MAX_UINT64;
+ args->layout = lo;
+ nfs4_stateid_copy(&args->stateid, stateid);
+}
+
static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
{
struct inode *ino = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
struct nfs4_layoutreturn *lrp;
int status = 0;
@@ -996,15 +1089,12 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
goto out;
}
- nfs4_stateid_copy(&lrp->args.stateid, stateid);
- lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
- lrp->args.inode = ino;
- lrp->args.range.iomode = iomode;
- lrp->args.range.offset = 0;
- lrp->args.range.length = NFS4_MAX_UINT64;
- lrp->args.layout = lo;
+ pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
+ lrp->args.ld_private = &lrp->ld_private;
lrp->clp = NFS_SERVER(ino)->nfs_client;
lrp->cred = lo->plh_lc_cred;
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(&lrp->args);
status = nfs4_proc_layoutreturn(lrp, sync);
out:
@@ -1067,7 +1157,7 @@ _pnfs_return_layout(struct inode *ino)
struct nfs_inode *nfsi = NFS_I(ino);
LIST_HEAD(tmp_list);
nfs4_stateid stateid;
- int status = 0, empty;
+ int status = 0;
bool send;
dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
@@ -1081,7 +1171,14 @@ _pnfs_return_layout(struct inode *ino)
}
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
- empty = list_empty(&lo->plh_segs);
+	/* Is there an outstanding layoutreturn? */
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE))
+ goto out_put_layout_hdr;
+ spin_lock(&ino->i_lock);
+ }
pnfs_clear_layoutcommit(ino, &tmp_list);
pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);
@@ -1095,7 +1192,7 @@ _pnfs_return_layout(struct inode *ino)
}
/* Don't send a LAYOUTRETURN if list was initially empty */
- if (empty) {
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
spin_unlock(&ino->i_lock);
dprintk("NFS: %s no layout segments to return\n", __func__);
goto out_put_layout_hdr;
@@ -1141,21 +1238,36 @@ pnfs_commit_and_return_layout(struct inode *inode)
return ret;
}
-bool pnfs_roc(struct inode *ino)
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct rpc_cred *cred)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct nfs_open_context *ctx;
struct nfs4_state *state;
struct pnfs_layout_hdr *lo;
- struct pnfs_layout_segment *lseg, *tmp;
+ struct pnfs_layout_segment *lseg, *next;
nfs4_stateid stateid;
- LIST_HEAD(tmp_list);
- bool found = false, layoutreturn = false, roc = false;
+ enum pnfs_iomode iomode = 0;
+ bool layoutreturn = false, roc = false;
+ if (!nfs_have_layout(ino))
+ return false;
+retry:
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ if (!lo || !pnfs_layout_is_valid(lo) ||
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
goto out_noroc;
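+	/* A layoutreturn is already in flight: wait for it to finish, then retry */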
+ if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
+ pnfs_get_layout_hdr(lo);
+ spin_unlock(&ino->i_lock);
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ TASK_UNINTERRUPTIBLE);
+ pnfs_put_layout_hdr(lo);
+ goto retry;
+ }
/* no roc if we hold a delegation */
if (nfs4_check_delegation(ino, FMODE_READ))
@@ -1168,78 +1280,73 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- /* always send layoutreturn if being marked so */
- if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo,
- &stateid, NULL);
- list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
/* If we are sending layoutreturn, invalidate all valid lsegs */
- if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
- mark_lseg_invalid(lseg, &tmp_list);
- found = true;
- }
+ if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
+ continue;
+ /*
+ * Note: mark lseg for return so pnfs_layout_remove_lseg
+ * doesn't invalidate the layout for us.
+ */
+ set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
+ continue;
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+ }
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ goto out_noroc;
+
/* ROC in two conditions:
* 1. there are ROC lsegs
* 2. we don't send layoutreturn
*/
- if (found && !layoutreturn) {
- /* lo ref dropped in pnfs_roc_release() */
- pnfs_get_layout_hdr(lo);
- roc = true;
- }
+ /* lo ref dropped in pnfs_roc_release() */
+ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ /* If the creds don't match, we can't compound the layoutreturn */
+ if (!layoutreturn || cred != lo->plh_lc_cred)
+ goto out_noroc;
+
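+	/* The layoutreturn now travels in the caller's compound; clear the
+	 * flag so we don't also send one below */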
+ roc = layoutreturn;
+ pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
+ res->lrs_present = 0;
+ layoutreturn = false;
out_noroc:
spin_unlock(&ino->i_lock);
- pnfs_free_lseg_list(&tmp_list);
pnfs_layoutcommit_inode(ino, true);
+ if (roc) {
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+ if (ld->prepare_layoutreturn)
+ ld->prepare_layoutreturn(args);
+ return true;
+ }
if (layoutreturn)
- pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
- return roc;
-}
-
-void pnfs_roc_release(struct inode *ino)
-{
- struct pnfs_layout_hdr *lo;
-
- spin_lock(&ino->i_lock);
- lo = NFS_I(ino)->layout;
- pnfs_clear_layoutreturn_waitbit(lo);
- if (atomic_dec_and_test(&lo->plh_refcount)) {
- pnfs_detach_layout_hdr(lo);
- spin_unlock(&ino->i_lock);
- pnfs_free_layout_hdr(lo);
- } else
- spin_unlock(&ino->i_lock);
+ pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+ return false;
}
-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
{
- struct pnfs_layout_hdr *lo;
+ struct pnfs_layout_hdr *lo = args->layout;
+ const nfs4_stateid *arg_stateid = NULL;
+ const nfs4_stateid *res_stateid = NULL;
+ struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
- spin_lock(&ino->i_lock);
- lo = NFS_I(ino)->layout;
- if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
- lo->plh_barrier = barrier;
- spin_unlock(&ino->i_lock);
- trace_nfs4_layoutreturn_on_close(ino, 0);
-}
-
-void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
-{
- struct nfs_inode *nfsi = NFS_I(ino);
- struct pnfs_layout_hdr *lo;
- u32 current_seqid;
-
- spin_lock(&ino->i_lock);
- lo = nfsi->layout;
- current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
-
- /* Since close does not return a layout stateid for use as
- * a barrier, we choose the worst-case barrier.
- */
- *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
- spin_unlock(&ino->i_lock);
+ if (ret == 0) {
+ arg_stateid = &args->stateid;
+ if (res->lrs_present)
+ res_stateid = &res->stateid;
+ }
+ pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
+ res_stateid);
+ if (ld_private && ld_private->ops && ld_private->ops->free)
+ ld_private->ops->free(ld_private);
+ pnfs_put_layout_hdr(lo);
+ trace_nfs4_layoutreturn_on_close(args->inode, 0);
}
bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
@@ -1252,13 +1359,11 @@ bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
* i_lock */
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
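+		/* Queue the task while still holding i_lock so a concurrent
+		 * wakeup cannot be missed */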
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
sleep = true;
+ }
spin_unlock(&ino->i_lock);
-
- if (sleep)
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
return sleep;
}
@@ -1375,6 +1480,7 @@ alloc_init_layout_hdr(struct inode *ino,
atomic_set(&lo->plh_refcount, 1);
INIT_LIST_HEAD(&lo->plh_layouts);
INIT_LIST_HEAD(&lo->plh_segs);
+ INIT_LIST_HEAD(&lo->plh_return_segs);
INIT_LIST_HEAD(&lo->plh_bulk_destroy);
lo->plh_inode = ino;
lo->plh_lc_cred = get_rpccred(ctx->cred);
@@ -1841,7 +1947,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
}
- if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+ if (!pnfs_layout_is_valid(lo)) {
+ /* We have a completely new layout */
+ pnfs_set_layout_stateid(lo, &res->stateid, true);
+ } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
@@ -1851,12 +1960,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
} else {
/*
* We got an entirely new state ID. Mark all segments for the
- * inode invalid, and don't bother validating the stateid
- * sequence number.
+ * inode invalid, and retry the layoutget
*/
pnfs_mark_layout_stateid_invalid(lo, &free_me);
-
- pnfs_set_layout_stateid(lo, &res->stateid, true);
+ goto out_forget;
}
pnfs_get_lseg(lseg);
@@ -1877,20 +1984,6 @@ out_forget:
return ERR_PTR(-EAGAIN);
}
-static void
-pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
- u32 seq)
-{
- if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
- iomode = IOMODE_ANY;
- lo->plh_return_iomode = iomode;
- set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
- if (seq != 0) {
- WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
- lo->plh_return_seq = seq;
- }
-}
-
/**
* pnfs_mark_matching_lsegs_return - Free or return matching layout segments
* @lo: pointer to layout header
@@ -1945,17 +2038,18 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
- LIST_HEAD(free_me);
bool return_now = false;
spin_lock(&inode->i_lock);
pnfs_set_plh_return_info(lo, range.iomode, 0);
+ /* Block LAYOUTGET */
+ set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
+ if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0)) {
nfs4_stateid stateid;
enum pnfs_iomode iomode;
@@ -1967,7 +2061,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
spin_unlock(&inode->i_lock);
nfs_commit_inode(inode, 0);
}
- pnfs_free_lseg_list(&free_me);
}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -2063,7 +2156,7 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
*
*/
if (pgio->pg_lseg) {
- seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
+ seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
pgio->pg_lseg->pls_range.length);
req_start = req_offset(req);
WARN_ON_ONCE(req_start >= seg_end);
@@ -2286,6 +2379,10 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
struct nfs_pageio_descriptor pgio;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ /* Prevent deadlocks with layoutreturn! */
+ pnfs_put_lseg(hdr->lseg);
+ hdr->lseg = NULL;
+
nfs_pageio_init_read(&pgio, hdr->inode, false,
hdr->completion_ops);
hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5c295512c967..63f77b49a586 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -96,6 +96,7 @@ enum {
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_RETURN, /* layoutreturn in progress */
+ NFS_LAYOUT_RETURN_LOCK, /* Serialise layoutreturn */
NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
@@ -171,8 +172,8 @@ struct pnfs_layoutdriver_type {
(struct nfs_server *server, struct pnfs_device *pdev,
gfp_t gfp_flags);
- void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
- struct xdr_stream *xdr,
+ int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *);
+ void (*encode_layoutreturn) (struct xdr_stream *xdr,
const struct nfs4_layoutreturn_args *args);
void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
@@ -181,7 +182,6 @@ struct pnfs_layoutdriver_type {
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
- void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
};
struct pnfs_layout_hdr {
@@ -190,6 +190,7 @@ struct pnfs_layout_hdr {
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_bulk_destroy;
struct list_head plh_segs; /* layout segments list */
+ struct list_head plh_return_segs; /* invalid layout segments */
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
unsigned long plh_retry_timestamp;
unsigned long plh_flags;
@@ -270,10 +271,13 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
u32 seq);
int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list);
-bool pnfs_roc(struct inode *ino);
-void pnfs_roc_release(struct inode *ino);
-void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
+bool pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct rpc_cred *cred);
+void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret);
bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task);
void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
@@ -292,7 +296,10 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
enum pnfs_iomode iomode,
bool strict_iomode,
gfp_t gfp_flags);
-void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
+void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *arg_stateid,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid);
void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
struct pnfs_layout_segment *lseg,
@@ -362,8 +369,7 @@ struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
void nfs4_pnfs_v3_ds_connect_unload(void);
void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
struct nfs4_deviceid_node *devid, unsigned int timeo,
- unsigned int retrans, u32 version, u32 minor_version,
- rpc_authflavor_t au_flavor);
+ unsigned int retrans, u32 version, u32 minor_version);
struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
struct xdr_stream *xdr,
gfp_t gfp_flags);
@@ -559,6 +565,38 @@ pnfs_copy_range(struct pnfs_layout_range *dst,
memcpy(dst, src, sizeof(*dst));
}
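+/* Return start + len, saturating to NFS4_MAX_UINT64 on overflow */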
+static inline u64
+pnfs_end_offset(u64 start, u64 len)
+{
+ if (NFS4_MAX_UINT64 - start <= len)
+ return NFS4_MAX_UINT64;
+ return start + len;
+}
+
+/*
+ * Are 2 ranges intersecting?
+ * start1 end1
+ * [----------------------------------)
+ * start2 end2
+ * [----------------)
+ */
+static inline bool
+pnfs_is_range_intersecting(u64 start1, u64 end1, u64 start2, u64 end2)
+{
+ return (end1 == NFS4_MAX_UINT64 || start2 < end1) &&
+ (end2 == NFS4_MAX_UINT64 || start1 < end2);
+}
+
+static inline bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1 = pnfs_end_offset(l1->offset, l1->length);
+ u64 end2 = pnfs_end_offset(l2->offset, l2->length);
+
+ return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2);
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
@@ -630,23 +668,18 @@ pnfs_layoutcommit_outstanding(struct inode *inode)
static inline bool
-pnfs_roc(struct inode *ino)
+pnfs_roc(struct inode *ino,
+ struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ const struct rpc_cred *cred)
{
return false;
}
static inline void
-pnfs_roc_release(struct inode *ino)
-{
-}
-
-static inline void
-pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
-{
-}
-
-static inline void
-pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
+pnfs_roc_release(struct nfs4_layoutreturn_args *args,
+ struct nfs4_layoutreturn_res *res,
+ int ret)
{
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 53b4705abcc7..9414b492439f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -600,8 +600,7 @@ static struct nfs_client *(*get_v3_ds_connect)(
int ds_addrlen,
int ds_proto,
unsigned int ds_timeo,
- unsigned int ds_retrans,
- rpc_authflavor_t au_flavor);
+ unsigned int ds_retrans);
static bool load_v3_ds_connect(void)
{
@@ -625,15 +624,13 @@ EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
struct nfs4_pnfs_ds *ds,
unsigned int timeo,
- unsigned int retrans,
- rpc_authflavor_t au_flavor)
+ unsigned int retrans)
{
struct nfs_client *clp = ERR_PTR(-EIO);
struct nfs4_pnfs_ds_addr *da;
int status = 0;
- dprintk("--> %s DS %s au_flavor %d\n", __func__,
- ds->ds_remotestr, au_flavor);
+ dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
if (!load_v3_ds_connect())
goto out;
@@ -657,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
clp = get_v3_ds_connect(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
- timeo, retrans, au_flavor);
+ timeo, retrans);
}
if (IS_ERR(clp)) {
@@ -676,15 +673,13 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
struct nfs4_pnfs_ds *ds,
unsigned int timeo,
unsigned int retrans,
- u32 minor_version,
- rpc_authflavor_t au_flavor)
+ u32 minor_version)
{
struct nfs_client *clp = ERR_PTR(-EIO);
struct nfs4_pnfs_ds_addr *da;
int status = 0;
- dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
- au_flavor);
+ dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
list_for_each_entry(da, &ds->ds_addrs, da_node) {
dprintk("%s: DS %s: trying address %s\n",
@@ -720,8 +715,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
clp = nfs4_set_ds_client(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
- timeo, retrans, minor_version,
- au_flavor);
+ timeo, retrans, minor_version);
if (IS_ERR(clp))
continue;
@@ -755,19 +749,17 @@ out:
*/
void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
struct nfs4_deviceid_node *devid, unsigned int timeo,
- unsigned int retrans, u32 version,
- u32 minor_version, rpc_authflavor_t au_flavor)
+ unsigned int retrans, u32 version, u32 minor_version)
{
if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
int err = 0;
if (version == 3) {
err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
- retrans, au_flavor);
+ retrans);
} else if (version == 4) {
err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
- retrans, minor_version,
- au_flavor);
+ retrans, minor_version);
} else {
dprintk("%s: unsupported DS version %d\n", __func__,
version);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 001796bcd6c8..ddce94ce8142 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2904,7 +2904,7 @@ module_param(max_session_slots, ushort, 0644);
MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
"requests the client will negotiate");
module_param(max_session_cb_slots, ushort, 0644);
-MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 "
+MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 "
"callbacks the client will process for a given server");
module_param(send_implementation_id, ushort, 0644);
MODULE_PARM_DESC(send_implementation_id,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53211838f72a..6e761f3f4cbf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1151,8 +1151,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
if (l_ctx && flctx &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock))) {
- do_flush |= l_ctx->lockowner.l_owner != current->files
- || l_ctx->lockowner.l_pid != current->tgid;
+ do_flush |= l_ctx->lockowner != current->files;
}
nfs_release_request(req);
if (!do_flush)
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index fd8c9a5bcac4..420d3a0ab258 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -9,7 +9,7 @@
#include <net/netns/generic.h>
#include <linux/fs.h>
-static int grace_net_id;
+static unsigned int grace_net_id;
static DEFINE_SPINLOCK(grace_lock);
/**
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ee36efd5aece..3714231a9d0f 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -124,5 +124,5 @@ struct nfsd_net {
/* Simple check to find out if a given net was properly initialized */
#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
-extern int nfsd_net_id;
+extern unsigned int nfsd_net_id;
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 36b2af931e06..2857e46d5cc5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1201,7 +1201,7 @@ static int create_proc_exports_entry(void)
}
#endif
-int nfsd_net_id;
+unsigned int nfsd_net_id;
static __net_init int nfsd_init_net(struct net *net)
{
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c95d369e90aa..12eeae62a2b1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
set_buffer_dirty(nilfs->ns_sbh[0]);
if (nilfs_test_opt(nilfs, BARRIER)) {
err = __sync_dirty_buffer(nilfs->ns_sbh[0],
- WRITE_SYNC | WRITE_FLUSH_FUA);
+ REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
} else {
err = sync_dirty_buffer(nilfs->ns_sbh[0]);
}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8718af895eab..8c9fb29c6673 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -118,7 +118,7 @@ again:
return ret;
}
-static int open_related_ns(struct ns_common *ns,
+int open_related_ns(struct ns_common *ns,
struct ns_common *(*get_ns)(struct ns_common *ns))
{
struct path path = {};
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fe251f187ff8..cc91856b5e2d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -29,6 +29,7 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
+#include <linux/bio.h>
#include "aops.h"
#include "attrib.h"
@@ -764,7 +765,7 @@ lock_retry_remap:
}
// TODO: Instantiate the hole.
// clear_buffer_new(bh);
- // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ // clean_bdev_bh_alias(bh);
ntfs_error(vol->sb, "Writing into sparse regions is "
"not supported yet. Sorry.");
err = -EOPNOTSUPP;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index bf72a2c58b75..99510d811a8c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -740,8 +740,7 @@ map_buffer_cached:
set_buffer_uptodate(bh);
if (unlikely(was_hole)) {
/* We allocated the buffer. */
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
if (bh_end <= pos || bh_pos >= end)
mark_buffer_dirty(bh);
else
@@ -784,7 +783,7 @@ map_buffer_cached:
continue;
}
/* We allocated the buffer. */
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
/*
* If the buffer is fully outside the write, zero it,
* set it uptodate, and mark it dirty so it gets
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 761f12f7f3ef..353379ff6057 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -27,6 +27,7 @@
#include <linux/buffer_head.h>
#include <linux/bitops.h>
#include <linux/log2.h>
+#include <linux/bio.h>
#include "attrib.h"
#include "aops.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index d3c009626032..b6f402194f02 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -23,6 +23,7 @@
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/swap.h>
+#include <linux/bio.h>
#include "attrib.h"
#include "aops.h"
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c5c5b9748ea3..4d9c6f5ec28a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -630,7 +630,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (!buffer_mapped(bh)) {
map_bh(bh, inode->i_sb, *p_blkno);
- unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
}
if (PageUptodate(page)) {
@@ -1950,8 +1950,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
}
int ocfs2_write_end_nolock(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ loff_t pos, unsigned len, unsigned copied, void *fsdata)
{
int i, ret;
unsigned from, to, start = pos & (PAGE_SIZE - 1);
@@ -2064,7 +2063,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
int ret;
struct inode *inode = mapping->host;
- ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+ ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_inode_unlock(inode, 1);
@@ -2241,7 +2240,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
dwc->dw_zero_count++;
}
- ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
BUG_ON(ret != len);
ret = 0;
unlock:
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index b1c9f28a57b1..8614ff069d99 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle,
struct buffer_head *bh));
int ocfs2_write_end_nolock(struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
+ loff_t pos, unsigned len, unsigned copied, void *fsdata);
typedef enum {
OCFS2_WRITE_BUFFER = 0,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 8f040f88ade4..d9ebe11c8990 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -26,6 +26,7 @@
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/highmem.h>
+#include <linux/bio.h>
#include <cluster/masklog.h>
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 636abcbd4650..96a155ab5059 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
slot = o2nm_this_node();
bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE,
- WRITE_SYNC);
+ REQ_SYNC);
if (IS_ERR(bio)) {
status = PTR_ERR(bio);
mlog_errno(status);
@@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
memset(hb_block, 0, reg->hr_block_bytes);
/* TODO: time stuff */
- cputime = CURRENT_TIME.tv_sec;
+ cputime = ktime_get_real_seconds();
if (!cputime)
cputime = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3f828a187049..a464c8088170 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1609,8 +1609,6 @@ way_up_top:
__dlm_insert_mle(dlm, mle);
response = DLM_MASTER_RESP_NO;
} else {
- // mlog(0, "mle was found\n");
- set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->master == dlm->node_num) {
mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
@@ -1625,8 +1623,7 @@ way_up_top:
response = DLM_MASTER_RESP_NO;
} else
response = DLM_MASTER_RESP_MAYBE;
- if (set_maybe)
- set_bit(request->node_idx, tmpmle->maybe_map);
+ set_bit(request->node_idx, tmpmle->maybe_map);
spin_unlock(&tmpmle->spinlock);
}
spin_unlock(&dlm->master_lock);
@@ -1644,12 +1641,6 @@ send_response:
* dlm_assert_master_worker() isn't called, we drop it here.
*/
if (dispatch_assert) {
- if (response != DLM_MASTER_RESP_YES)
- mlog(ML_ERROR, "invalid response %d\n", response);
- if (!res) {
- mlog(ML_ERROR, "bad lockres while trying to assert!\n");
- BUG();
- }
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
spin_lock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index dd5cb8bcefd1..74407c6dd592 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->spinlock);
dlm_kick_recovery_thread(dlm);
break;
- default:
- BUG();
}
mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c56a7679df93..382401d3e88f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail_commit;
}
- di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+ di->i_dtime = cpu_to_le64(ktime_get_real_seconds());
di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a244f14c6b87..d5e5fa7f0743 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
*/
seqno++;
os->os_count++;
- os->os_scantime = CURRENT_TIME;
+ os->os_scantime = ktime_get_seconds();
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
@@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
struct ocfs2_orphan_scan *os;
os = &osb->osb_orphan_scan;
- os->os_scantime = CURRENT_TIME;
+ os->os_scantime = ktime_get_seconds();
if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 71545ad4628c..429088786e93 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
ret = VM_FAULT_NOPAGE;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
- fsdata);
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
BUG_ON(ret != len);
ret = VM_FAULT_LOCKED;
out:
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8d887c75765c..3b0a10d9b36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
struct ocfs2_extent_list *fel;
u16 feat;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct timespec64 ts;
*new_fe_bh = NULL;
@@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir,
fe->i_last_eb_blk = 0;
strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
+ ktime_get_real_ts64(&ts);
fe->i_atime = fe->i_ctime = fe->i_mtime =
- cpu_to_le64(CURRENT_TIME.tv_sec);
+ cpu_to_le64(ts.tv_sec);
fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
- cpu_to_le32(CURRENT_TIME.tv_nsec);
+ cpu_to_le32(ts.tv_nsec);
fe->i_dtime = 0;
/*
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index e63af7ddfe68..7e5958b0be6b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,7 +224,7 @@ struct ocfs2_orphan_scan {
struct ocfs2_super *os_osb;
struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
struct delayed_work os_orphan_scan_work;
- struct timespec os_scantime; /* time this node ran the scan */
+ time64_t os_scantime; /* time this node ran the scan */
u32 os_count; /* tracks node specific scans */
u32 os_seqno; /* tracks cluster wide scans */
atomic_t os_state; /* ACTIVE or INACTIVE */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19238512a324..738b4ea8e990 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -478,7 +478,6 @@ again:
if (ret) {
mlog_errno(ret);
ocfs2_unlock_refcount_tree(osb, tree, rw);
- ocfs2_refcount_tree_put(tree);
goto out;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f56fe39fab04..c894d945b084 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
out += snprintf(buf + out, len - out, "Disabled\n");
else
out += snprintf(buf + out, len - out, "%lu seconds ago\n",
- (get_seconds() - os->os_scantime.tv_sec));
+ (unsigned long)(ktime_get_seconds() - os->os_scantime));
out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
"Slots", "Num", "RecoGen");
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index ef3b4eb54cf2..551bc74ed2b8 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -8,6 +8,7 @@
* Linux VFS inode operations.
*/
+#include <linux/bvec.h>
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index f1e824979aad..27e75cf28b3a 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -673,8 +673,10 @@ int orangefs_prepare_debugfs_help_string(int at_boot)
*/
cdm_element_count =
orangefs_prepare_cdm_array(client_debug_array_string);
- if (cdm_element_count <= 0)
+ if (cdm_element_count <= 0) {
+ kfree(new);
goto out;
+ }
for (i = 0; i < cdm_element_count; i++) {
strlcat(new, "\t", string_size);
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index a799546a67f7..084954448f18 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -609,15 +609,6 @@ static ssize_t sysfs_service_op_store(struct kobject *kobj,
new_op->upcall.req.param.u.value32[0] = val1;
new_op->upcall.req.param.u.value32[1] = val2;
goto value_set;
- } else if (!strcmp(attr->attr.name,
- "perf_counter_reset")) {
- if ((val > 0)) {
- new_op->upcall.req.param.op =
- ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
- } else {
- rc = 0;
- goto out;
- }
}
} else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) {
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index edd46a0e951d..0e100856c7b8 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -328,11 +328,11 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
if (!real)
goto bug;
+ /* Handle recursion */
+ real = d_real(real, inode, open_flags);
+
if (!inode || inode == d_inode(real))
return real;
-
- /* Handle recursion */
- return d_real(real, inode, open_flags);
bug:
WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 81818adb8e9e..51a4213afa2e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header,
if (sigismember(set, i+2)) x |= 2;
if (sigismember(set, i+3)) x |= 4;
if (sigismember(set, i+4)) x |= 8;
- seq_printf(m, "%x", x);
+ seq_putc(m, hex_asc[x]);
} while (i >= 4);
seq_putc(m, '\n');
@@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
{
+ seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
#ifdef CONFIG_SECCOMP
- seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
- seq_putc(m, '\n');
+ seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
+ seq_putc(m, '\n');
}
static inline void task_context_switch_counts(struct seq_file *m,
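
In render_sigset_t(), x is a 4-bit accumulator, so "%x" always prints exactly one digit; indexing the kernel's lowercase digit table with seq_putc() skips format parsing entirely. The equivalence, as a user-space sketch:

	#include <stdio.h>

	static const char hex_asc[] = "0123456789abcdef";	/* as in lib/hexdump.c */

	int main(void)
	{
		for (unsigned int x = 0; x < 16; x++)
			putchar(hex_asc[x]);	/* same character "%x" would emit */
		putchar('\n');
		return 0;
	}
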
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ca651ac00660..5ea836362870 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -104,9 +104,12 @@
* in /proc for a task before it execs a suid executable.
*/
+static u8 nlink_tid;
+static u8 nlink_tgid;
+
struct pid_entry {
const char *name;
- int len;
+ unsigned int len;
umode_t mode;
const struct inode_operations *iop;
const struct file_operations *fop;
@@ -139,13 +142,13 @@ struct pid_entry {
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
*/
-static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
+static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
unsigned int n)
{
unsigned int i;
unsigned int count;
- count = 0;
+ count = 2;
for (i = 0; i < n; ++i) {
if (S_ISDIR(entries[i].mode))
++count;
@@ -1243,7 +1246,7 @@ static const struct file_operations proc_oom_score_adj_operations = {
};
#ifdef CONFIG_AUDITSYSCALL
-#define TMPBUFLEN 21
+#define TMPBUFLEN 11
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -1664,7 +1667,8 @@ const struct inode_operations proc_pid_link_inode_operations = {
/* building an inode */
-struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
+struct inode *proc_pid_make_inode(struct super_block * sb,
+ struct task_struct *task, umode_t mode)
{
struct inode * inode;
struct proc_inode *ei;
@@ -1678,6 +1682,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t
/* Common stuff */
ei = PROC_I(inode);
+ inode->i_mode = mode;
inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_op = &proc_def_inode_operations;
@@ -1967,7 +1972,7 @@ out:
struct map_files_info {
fmode_t mode;
- unsigned long len;
+ unsigned int len;
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
@@ -2004,7 +2009,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
+ ((mode & FMODE_READ ) ? S_IRUSR : 0) |
+ ((mode & FMODE_WRITE) ? S_IWUSR : 0));
if (!inode)
return -ENOENT;
@@ -2013,12 +2020,6 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- inode->i_mode = S_IFLNK;
-
- if (mode & FMODE_READ)
- inode->i_mode |= S_IRUSR;
- if (mode & FMODE_WRITE)
- inode->i_mode |= S_IWUSR;
d_set_d_op(dentry, &tid_map_files_dentry_operations);
d_add(dentry, inode);
@@ -2372,12 +2373,11 @@ static int proc_pident_instantiate(struct inode *dir,
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
if (!inode)
goto out;
ei = PROC_I(inode);
- inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
set_nlink(inode, 2); /* Use getattr to fix if necessary */
if (p->iop)
@@ -2412,14 +2412,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
* Yes, it does not scale. And it should not. Don't add
* new entries into /proc/<tgid>/ without very good reasons.
*/
- last = &ents[nents - 1];
- for (p = ents; p <= last; p++) {
+ last = &ents[nents];
+ for (p = ents; p < last; p++) {
if (p->len != dentry->d_name.len)
continue;
if (!memcmp(dentry->d_name.name, p->name, p->len))
break;
}
- if (p > last)
+ if (p >= last)
goto out;
error = proc_pident_instantiate(dir, dentry, task, p);
@@ -2444,7 +2444,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
if (ctx->pos >= nents + 2)
goto out;
- for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+ for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
if (!proc_fill_cache(file, ctx, p->name, p->len,
proc_pident_instantiate, task, p))
break;
@@ -3059,17 +3059,15 @@ static int proc_pid_instantiate(struct inode *dir,
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
goto out;
- inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
- ARRAY_SIZE(tgid_base_stuff)));
+ set_nlink(inode, nlink_tgid);
d_set_d_op(dentry, &pid_dentry_operations);
@@ -3352,17 +3350,15 @@ static int proc_task_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
{
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
goto out;
- inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
- ARRAY_SIZE(tid_base_stuff)));
+ set_nlink(inode, nlink_tid);
d_set_d_op(dentry, &pid_dentry_operations);
@@ -3552,3 +3548,9 @@ static const struct file_operations proc_task_operations = {
.iterate_shared = proc_task_readdir,
.llseek = generic_file_llseek,
};
+
+void __init set_proc_pid_nlink(void)
+{
+ nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+ nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+}
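
The nlink for /proc/<pid> and /proc/<pid>/task/<tid> is now computed once at boot: pid_entry_nlink() starts at 2 (the directory's own "." plus the link from its parent) and adds one per subdirectory, whose ".." links back. A compilable sketch of the count, with a hypothetical entry type:

	#include <stdio.h>
	#include <stdbool.h>

	struct entry { bool is_dir; };	/* stand-in for struct pid_entry */

	static unsigned int entry_nlink(const struct entry *e, unsigned int n)
	{
		unsigned int count = 2;		/* "." and the parent's entry */

		for (unsigned int i = 0; i < n; i++)
			if (e[i].is_dir)
				count++;	/* each subdir's ".." links back */
		return count;
	}

	int main(void)
	{
		const struct entry tbl[] = { { true }, { false }, { true } };

		printf("nlink = %u\n", entry_nlink(tbl, 3));	/* prints 4 */
		return 0;
	}
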
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d21dafef3102..4274f83bf100 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -183,14 +183,13 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK);
if (!inode)
goto out;
ei = PROC_I(inode);
ei->fd = fd;
- inode->i_mode = S_IFLNK;
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
@@ -322,14 +321,13 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR);
if (!inode)
goto out;
ei = PROC_I(inode);
ei->fd = fd;
- inode->i_mode = S_IFREG | S_IRUSR;
inode->i_fop = &proc_fdinfo_file_operations;
d_set_d_op(dentry, &tid_fd_dentry_operations);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5f2dc2032c79..7eb3cefcf2a3 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -479,6 +479,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
}
return ent;
}
+EXPORT_SYMBOL(proc_create_mount_point);
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e69ebe648a34..783bc19644d1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde)
/* pde is locked */
static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
{
+ /*
+ * close() (proc_reg_release()) can't delete an entry and proceed:
+ * ->release hook needs to be available at the right moment.
+ *
+ * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
+ * "struct file" needs to be available at the right moment.
+ *
+ * Therefore, the first process to enter this function does ->release() and
+ * signals its completion to the other process which does nothing.
+ */
if (pdeo->closing) {
/* somebody else is doing that, just wait */
DECLARE_COMPLETION_ONSTACK(c);
@@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
spin_lock(&pde->pde_unload_lock);
} else {
struct file *file;
- pdeo->closing = 1;
+ pdeo->closing = true;
spin_unlock(&pde->pde_unload_lock);
file = pdeo->file;
pde->proc_fops->release(file_inode(file), file);
spin_lock(&pde->pde_unload_lock);
- list_del_init(&pdeo->lh);
+ /* After ->release. */
+ list_del(&pdeo->lh);
if (pdeo->c)
complete(pdeo->c);
kfree(pdeo);
@@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de)
if (atomic_add_return(BIAS, &de->in_use) != BIAS)
wait_for_completion(&c);
+ /* ->pde_openers list can't grow from now on. */
+
spin_lock(&de->pde_unload_lock);
while (!list_empty(&de->pde_openers)) {
struct pde_opener *pdeo;
@@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file)
struct pde_opener *pdeo;
/*
- * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
- * sequence. ->release won't be called because ->proc_fops will be
- * cleared. Depending on complexity of ->release, consequences vary.
+ * Ensure that
+ * 1) PDE's ->release hook will be called no matter what
+ * either normally by close()/->release, or forcefully by
+ * rmmod/remove_proc_entry.
+ *
+ * 2) rmmod isn't blocked by opening file in /proc and sitting on
+ * the descriptor (including "rmmod foo </proc/foo" scenario).
*
- * We can't wait for mercy when close will be done for real, it's
- * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
- * by hand in remove_proc_entry(). For this, save opener's credentials
- * for later.
+ * Save every "struct file" with custom ->release hook.
*/
- pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
+ pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
if (!pdeo)
return -ENOMEM;
@@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (rv == 0 && release) {
/* To know what to release. */
pdeo->file = file;
- /* Strictly for "too late" ->release in proc_reg_release(). */
+ pdeo->closing = false;
+ pdeo->c = NULL;
spin_lock(&pde->pde_unload_lock);
list_add(&pdeo->lh, &pde->pde_openers);
spin_unlock(&pde->pde_unload_lock);
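
The comment block added to close_pdeo() describes a first-past-the-post hand-off: whichever of close() and remove_proc_entry() arrives first runs ->release and frees the opener; the latecomer parks on a completion. The shape of that protocol, stripped to a sketch (kernel primitives assumed; not the proc code itself):

	#include <linux/completion.h>
	#include <linux/spinlock.h>

	struct opener {
		bool closing;
		struct completion *c;
	};

	/* Called with *lock held; may drop and retake it. */
	static void close_opener(spinlock_t *lock, struct opener *o)
	{
		if (o->closing) {			/* loser: just wait */
			DECLARE_COMPLETION_ONSTACK(done);

			o->c = &done;
			spin_unlock(lock);
			wait_for_completion(&done);
			spin_lock(lock);
		} else {				/* winner: do the release */
			o->closing = true;
			spin_unlock(lock);
			/* ... invoke ->release() without the lock held ... */
			spin_lock(lock);
			if (o->c)
				complete(o->c);
		}
	}
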
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5378441ec1b7..2de5194ba378 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -162,7 +162,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
extern const struct dentry_operations pid_dentry_operations;
extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int proc_setattr(struct dentry *, struct iattr *);
-extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
+extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
extern int pid_revalidate(struct dentry *, unsigned int);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
@@ -195,7 +195,6 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde)
{
return S_ISDIR(pde->mode) && !pde->proc_iops;
}
-struct proc_dir_entry *proc_create_mount_point(const char *name);
/*
* inode.c
@@ -203,7 +202,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name);
struct pde_opener {
struct file *file;
struct list_head lh;
- int closing;
+ bool closing;
struct completion *c;
};
extern const struct inode_operations proc_link_inode_operations;
@@ -211,6 +210,7 @@ extern const struct inode_operations proc_link_inode_operations;
extern const struct inode_operations proc_pid_link_inode_operations;
extern void proc_init_inodecache(void);
+void set_proc_pid_nlink(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 51b8b0a8ad91..766f0c637ad1 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -92,12 +92,11 @@ static int proc_ns_instantiate(struct inode *dir,
struct inode *inode;
struct proc_inode *ei;
- inode = proc_pid_make_inode(dir->i_sb, task);
+ inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO);
if (!inode)
goto out;
ei = PROC_I(inode);
- inode->i_mode = S_IFLNK|S_IRWXUGO;
inode->i_op = &proc_ns_link_inode_operations;
ei->ns_ops = ns_ops;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 8d3e484055a6..4bd0373576b5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -122,6 +122,7 @@ void __init proc_root_init(void)
int err;
proc_init_inodecache();
+ set_proc_pid_nlink();
err = register_filesystem(&proc_fs_type);
if (err)
return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 35b92d81692f..958f32545064 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index be40813eff52..b42e5bd6d8ff 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -86,4 +86,4 @@ config PSTORE_RAM
Note that for historical reasons, the module will be named
"ramoops.ko".
- For more information, see Documentation/ramoops.txt.
+ For more information, see Documentation/admin-guide/ramoops.rst.
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index d4887705bb61..899d0ba0bd6c 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -27,6 +27,9 @@
#include <asm/barrier.h>
#include "internal.h"
+/* This doesn't need to be atomic: speed is chosen over correctness here. */
+static u64 pstore_ftrace_stamp;
+
static void notrace pstore_ftrace_call(unsigned long ip,
unsigned long parent_ip,
struct ftrace_ops *op,
@@ -42,6 +45,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
rec.ip = ip;
rec.parent_ip = parent_ip;
+ pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
0, sizeof(rec), psinfo);
@@ -71,10 +75,13 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf,
if (!on ^ pstore_ftrace_enabled)
goto out;
- if (on)
+ if (on) {
+ ftrace_ops_set_global_filter(&pstore_ftrace_ops);
ret = register_ftrace_function(&pstore_ftrace_ops);
- else
+ } else {
ret = unregister_ftrace_function(&pstore_ftrace_ops);
+ }
+
if (ret) {
pr_err("%s: unable to %sregister ftrace ops: %zd\n",
__func__, on ? "" : "un", ret);
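
pstore_ftrace_stamp tags every record with a per-boot sequence number so the per-cpu logs can be interleaved at read time; as the comment notes, a plain non-atomic increment is accepted on the trace hot path. The trade-off in isolation:

	#include <stdint.h>

	/* Deliberately non-atomic: a duplicated or skipped stamp only perturbs
	 * the approximate ordering of merged records, which is tolerable. */
	static uint64_t stamp;

	static uint64_t next_stamp(void)
	{
		return stamp++;		/* may race across CPUs by design */
	}
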
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 1781dc50762e..57c0646479f5 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -107,9 +107,11 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
struct pstore_ftrace_seq_data *data = v;
struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
- seq_printf(s, "%d %08lx %08lx %pf <- %pF\n",
- pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip,
- (void *)rec->ip, (void *)rec->parent_ip);
+ seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
+ pstore_ftrace_decode_cpu(rec),
+ pstore_ftrace_read_timestamp(rec),
+ rec->ip, rec->parent_ip, (void *)rec->ip,
+ (void *)rec->parent_ip);
return 0;
}
@@ -197,11 +199,14 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
if (err)
return err;
- if (p->psi->erase)
+ if (p->psi->erase) {
+ mutex_lock(&p->psi->read_mutex);
p->psi->erase(p->type, p->id, p->count,
d_inode(dentry)->i_ctime, p->psi);
- else
+ mutex_unlock(&p->psi->read_mutex);
+ } else {
return -EPERM;
+ }
return simple_unlink(dir, dentry);
}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index e38a22b31282..da416e6591c9 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -5,40 +5,6 @@
#include <linux/time.h>
#include <linux/pstore.h>
-#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB)
-#define PSTORE_CPU_IN_IP 0x1
-#elif NR_CPUS <= 4 && defined(CONFIG_ARM)
-#define PSTORE_CPU_IN_IP 0x3
-#endif
-
-struct pstore_ftrace_record {
- unsigned long ip;
- unsigned long parent_ip;
-#ifndef PSTORE_CPU_IN_IP
- unsigned int cpu;
-#endif
-};
-
-static inline void
-pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu)
-{
-#ifndef PSTORE_CPU_IN_IP
- rec->cpu = cpu;
-#else
- rec->ip |= cpu;
-#endif
-}
-
-static inline unsigned int
-pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
-{
-#ifndef PSTORE_CPU_IN_IP
- return rec->cpu;
-#else
- return rec->ip & PSTORE_CPU_IN_IP;
-#endif
-}
-
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
extern void pstore_unregister_ftrace(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 14984d902a99..729677e18e36 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -493,6 +493,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
if (!is_locked) {
pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
, in_nmi() ? "NMI" : why);
+ return;
}
} else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
@@ -584,8 +585,8 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
} else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
- memcpy(psinfo->buf, s, c);
- psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
+ psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0,
+ s, 0, c, psinfo);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
s += c;
c = e - s;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 6ad831b9d1b8..27c059e1760a 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -85,10 +85,10 @@ MODULE_PARM_DESC(ramoops_ecc,
"bytes ECC)");
struct ramoops_context {
- struct persistent_ram_zone **przs;
- struct persistent_ram_zone *cprz;
- struct persistent_ram_zone *fprz;
- struct persistent_ram_zone *mprz;
+ struct persistent_ram_zone **dprzs; /* Oops dump zones */
+ struct persistent_ram_zone *cprz; /* Console zone */
+ struct persistent_ram_zone **fprzs; /* Ftrace zones */
+ struct persistent_ram_zone *mprz; /* PMSG zone */
phys_addr_t phys_addr;
unsigned long size;
unsigned int memtype;
@@ -97,12 +97,14 @@ struct ramoops_context {
size_t ftrace_size;
size_t pmsg_size;
int dump_oops;
+ u32 flags;
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
unsigned int dump_write_cnt;
/* _read_cnt need clear on ramoops_pstore_open */
unsigned int dump_read_cnt;
unsigned int console_read_cnt;
+ unsigned int max_ftrace_cnt;
unsigned int ftrace_read_cnt;
unsigned int pmsg_read_cnt;
struct pstore_info pstore;
@@ -180,16 +182,69 @@ static bool prz_ok(struct persistent_ram_zone *prz)
persistent_ram_ecc_string(prz, NULL, 0));
}
+static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest,
+ struct persistent_ram_zone *src)
+{
+ size_t dest_size, src_size, total, dest_off, src_off;
+ size_t dest_idx = 0, src_idx = 0, merged_idx = 0;
+ void *merged_buf;
+ struct pstore_ftrace_record *drec, *srec, *mrec;
+ size_t record_size = sizeof(struct pstore_ftrace_record);
+
+ dest_off = dest->old_log_size % record_size;
+ dest_size = dest->old_log_size - dest_off;
+
+ src_off = src->old_log_size % record_size;
+ src_size = src->old_log_size - src_off;
+
+ total = dest_size + src_size;
+ merged_buf = kmalloc(total, GFP_KERNEL);
+ if (!merged_buf)
+ return -ENOMEM;
+
+ drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off);
+ srec = (struct pstore_ftrace_record *)(src->old_log + src_off);
+ mrec = (struct pstore_ftrace_record *)(merged_buf);
+
+ while (dest_size > 0 && src_size > 0) {
+ if (pstore_ftrace_read_timestamp(&drec[dest_idx]) <
+ pstore_ftrace_read_timestamp(&srec[src_idx])) {
+ mrec[merged_idx++] = drec[dest_idx++];
+ dest_size -= record_size;
+ } else {
+ mrec[merged_idx++] = srec[src_idx++];
+ src_size -= record_size;
+ }
+ }
+
+ while (dest_size > 0) {
+ mrec[merged_idx++] = drec[dest_idx++];
+ dest_size -= record_size;
+ }
+
+ while (src_size > 0) {
+ mrec[merged_idx++] = srec[src_idx++];
+ src_size -= record_size;
+ }
+
+ kfree(dest->old_log);
+ dest->old_log = merged_buf;
+ dest->old_log_size = total;
+
+ return 0;
+}
+
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time,
char **buf, bool *compressed,
ssize_t *ecc_notice_size,
struct pstore_info *psi)
{
- ssize_t size;
+ ssize_t size = 0;
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz = NULL;
int header_length = 0;
+ bool free_prz = false;
/* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
* PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
@@ -201,7 +256,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
/* Find the next valid persistent_ram_zone for DMESG */
while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
- prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
+ prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
cxt->max_dump_cnt, id, type,
PSTORE_TYPE_DMESG, 1);
if (!prz_ok(prz))
@@ -219,14 +274,56 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
1, id, type, PSTORE_TYPE_CONSOLE, 0);
- if (!prz_ok(prz))
- prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
- 1, id, type, PSTORE_TYPE_FTRACE, 0);
+
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
1, id, type, PSTORE_TYPE_PMSG, 0);
- if (!prz_ok(prz))
- return 0;
+
+ /* ftrace is last since it may want to dynamically allocate memory. */
+ if (!prz_ok(prz)) {
+ if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
+ prz = ramoops_get_next_prz(cxt->fprzs,
+ &cxt->ftrace_read_cnt, 1, id, type,
+ PSTORE_TYPE_FTRACE, 0);
+ } else {
+ /*
+ * Build a new dummy record which combines all the
+ * per-cpu records including metadata and ecc info.
+ */
+ struct persistent_ram_zone *tmp_prz, *prz_next;
+
+ tmp_prz = kzalloc(sizeof(struct persistent_ram_zone),
+ GFP_KERNEL);
+ if (!tmp_prz)
+ return -ENOMEM;
+ free_prz = true;
+
+ while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
+ prz_next = ramoops_get_next_prz(cxt->fprzs,
+ &cxt->ftrace_read_cnt,
+ cxt->max_ftrace_cnt, id,
+ type, PSTORE_TYPE_FTRACE, 0);
+
+ if (!prz_ok(prz_next))
+ continue;
+
+ tmp_prz->ecc_info = prz_next->ecc_info;
+ tmp_prz->corrected_bytes +=
+ prz_next->corrected_bytes;
+ tmp_prz->bad_blocks += prz_next->bad_blocks;
+ size = ftrace_log_combine(tmp_prz, prz_next);
+ if (size)
+ goto out;
+ }
+ *id = 0;
+ prz = tmp_prz;
+ }
+ }
+
+ if (!prz_ok(prz)) {
+ size = 0;
+ goto out;
+ }
size = persistent_ram_old_size(prz) - header_length;
@@ -234,12 +331,21 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
*ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
*buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL);
- if (*buf == NULL)
- return -ENOMEM;
+ if (*buf == NULL) {
+ size = -ENOMEM;
+ goto out;
+ }
memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
+
persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1);
+out:
+ if (free_prz) {
+ kfree(prz->old_log);
+ kfree(prz);
+ }
+
return size;
}
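
ftrace_log_combine() above is the merge step of a merge sort keyed on the timestamp: each per-cpu zone is already time-ordered, so a single linear pass produces one globally ordered log. The same pass as a self-contained user-space sketch (hypothetical record type):

	#include <stddef.h>
	#include <stdint.h>

	struct rec { uint64_t ts; };	/* stand-in for pstore_ftrace_record */

	/* Merge two ts-sorted arrays into out[na + nb]. */
	static void merge_by_ts(const struct rec *a, size_t na,
				const struct rec *b, size_t nb, struct rec *out)
	{
		size_t i = 0, j = 0, k = 0;

		while (i < na && j < nb)
			out[k++] = (a[i].ts < b[j].ts) ? a[i++] : b[j++];
		while (i < na)			/* drain whichever side remains */
			out[k++] = a[i++];
		while (j < nb)
			out[k++] = b[j++];
	}
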
@@ -283,15 +389,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
persistent_ram_write(cxt->cprz, buf, size);
return 0;
} else if (type == PSTORE_TYPE_FTRACE) {
- if (!cxt->fprz)
+ int zonenum;
+
+ if (!cxt->fprzs)
return -ENOMEM;
- persistent_ram_write(cxt->fprz, buf, size);
+ /*
+ * Choose the zone based on whether we're using per-cpu buffers.
+ */
+ if (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
+ zonenum = smp_processor_id();
+ else
+ zonenum = 0;
+
+ persistent_ram_write(cxt->fprzs[zonenum], buf, size);
return 0;
} else if (type == PSTORE_TYPE_PMSG) {
- if (!cxt->mprz)
- return -ENOMEM;
- persistent_ram_write(cxt->mprz, buf, size);
- return 0;
+ pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__);
+ return -EINVAL;
}
if (type != PSTORE_TYPE_DMESG)
@@ -316,10 +430,10 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
if (part != 1)
return -ENOSPC;
- if (!cxt->przs)
+ if (!cxt->dprzs)
return -ENOSPC;
- prz = cxt->przs[cxt->dump_write_cnt];
+ prz = cxt->dprzs[cxt->dump_write_cnt];
hlen = ramoops_write_kmsg_hdr(prz, compressed);
if (size + hlen > prz->buffer_size)
@@ -359,13 +473,15 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
case PSTORE_TYPE_DMESG:
if (id >= cxt->max_dump_cnt)
return -EINVAL;
- prz = cxt->przs[id];
+ prz = cxt->dprzs[id];
break;
case PSTORE_TYPE_CONSOLE:
prz = cxt->cprz;
break;
case PSTORE_TYPE_FTRACE:
- prz = cxt->fprz;
+ if (id >= cxt->max_ftrace_cnt)
+ return -EINVAL;
+ prz = cxt->fprzs[id];
break;
case PSTORE_TYPE_PMSG:
prz = cxt->mprz;
@@ -396,68 +512,113 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
- if (!cxt->przs)
- return;
+ /* Free dump PRZs */
+ if (cxt->dprzs) {
+ for (i = 0; i < cxt->max_dump_cnt; i++)
+ persistent_ram_free(cxt->dprzs[i]);
- for (i = 0; i < cxt->max_dump_cnt; i++)
- persistent_ram_free(cxt->przs[i]);
+ kfree(cxt->dprzs);
+ cxt->max_dump_cnt = 0;
+ }
- kfree(cxt->przs);
- cxt->max_dump_cnt = 0;
+ /* Free ftrace PRZs */
+ if (cxt->fprzs) {
+ for (i = 0; i < cxt->max_ftrace_cnt; i++)
+ persistent_ram_free(cxt->fprzs[i]);
+ kfree(cxt->fprzs);
+ cxt->max_ftrace_cnt = 0;
+ }
}
-static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
- phys_addr_t *paddr, size_t dump_mem_sz)
+static int ramoops_init_przs(const char *name,
+ struct device *dev, struct ramoops_context *cxt,
+ struct persistent_ram_zone ***przs,
+ phys_addr_t *paddr, size_t mem_sz,
+ ssize_t record_size,
+ unsigned int *cnt, u32 sig, u32 flags)
{
int err = -ENOMEM;
int i;
+ size_t zone_sz;
+ struct persistent_ram_zone **prz_ar;
- if (!cxt->record_size)
+ /* Allocate nothing for 0 mem_sz or 0 record_size. */
+ if (mem_sz == 0 || record_size == 0) {
+ *cnt = 0;
return 0;
+ }
- if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
- dev_err(dev, "no room for dumps\n");
- return -ENOMEM;
+ /*
+ * If we have a negative record size, calculate it based on
+ * mem_sz / *cnt. If we have a positive record size, calculate
+ * cnt from mem_sz / record_size.
+ */
+ if (record_size < 0) {
+ if (*cnt == 0)
+ return 0;
+ record_size = mem_sz / *cnt;
+ if (record_size == 0) {
+ dev_err(dev, "%s record size == 0 (%zu / %u)\n",
+ name, mem_sz, *cnt);
+ goto fail;
+ }
+ } else {
+ *cnt = mem_sz / record_size;
+ if (*cnt == 0) {
+ dev_err(dev, "%s record count == 0 (%zu / %zu)\n",
+ name, mem_sz, record_size);
+ goto fail;
+ }
}
- cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
- if (!cxt->max_dump_cnt)
- return -ENOMEM;
+ if (*paddr + mem_sz - cxt->phys_addr > cxt->size) {
+ dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+ name,
+ mem_sz, (unsigned long long)*paddr,
+ cxt->size, (unsigned long long)cxt->phys_addr);
+ goto fail;
+ }
- cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt,
- GFP_KERNEL);
- if (!cxt->przs) {
- dev_err(dev, "failed to initialize a prz array for dumps\n");
- goto fail_mem;
+ zone_sz = mem_sz / *cnt;
+ if (!zone_sz) {
+ dev_err(dev, "%s zone size == 0\n", name);
+ goto fail;
}
- for (i = 0; i < cxt->max_dump_cnt; i++) {
- cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0,
+ prz_ar = kcalloc(*cnt, sizeof(**przs), GFP_KERNEL);
+ if (!prz_ar)
+ goto fail;
+
+ for (i = 0; i < *cnt; i++) {
+ prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
&cxt->ecc_info,
- cxt->memtype);
- if (IS_ERR(cxt->przs[i])) {
- err = PTR_ERR(cxt->przs[i]);
- dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
- cxt->record_size, (unsigned long long)*paddr, err);
+ cxt->memtype, flags);
+ if (IS_ERR(prz_ar[i])) {
+ err = PTR_ERR(prz_ar[i]);
+ dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
+ name, record_size,
+ (unsigned long long)*paddr, err);
while (i > 0) {
i--;
- persistent_ram_free(cxt->przs[i]);
+ persistent_ram_free(prz_ar[i]);
}
- goto fail_prz;
+ kfree(prz_ar);
+ goto fail;
}
- *paddr += cxt->record_size;
+ *paddr += zone_sz;
}
+ *przs = prz_ar;
return 0;
-fail_prz:
- kfree(cxt->przs);
-fail_mem:
- cxt->max_dump_cnt = 0;
+
+fail:
+ *cnt = 0;
return err;
}
-static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
+static int ramoops_init_prz(const char *name,
+ struct device *dev, struct ramoops_context *cxt,
struct persistent_ram_zone **prz,
phys_addr_t *paddr, size_t sz, u32 sig)
{
@@ -465,18 +626,19 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
return 0;
if (*paddr + sz - cxt->phys_addr > cxt->size) {
- dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
- sz, (unsigned long long)*paddr,
+ dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
+ name, sz, (unsigned long long)*paddr,
cxt->size, (unsigned long long)cxt->phys_addr);
return -ENOMEM;
}
- *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype);
+ *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
+ cxt->memtype, 0);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
- dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
- sz, (unsigned long long)*paddr, err);
+ dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
+ name, sz, (unsigned long long)*paddr, err);
return err;
}
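
ramoops_init_przs() now handles two sizing modes: a positive record_size derives the count (dump zones), while a negative one splits mem_sz across a caller-supplied count (per-cpu ftrace zones). Just the sizing rule, as a checkable sketch:

	#include <stddef.h>

	/* Returns the per-zone size, or 0 when nothing should be allocated. */
	static size_t zone_size(size_t mem_sz, long record_size, unsigned int *cnt)
	{
		if (mem_sz == 0 || record_size == 0) {
			*cnt = 0;			/* allocate nothing */
			return 0;
		}
		if (record_size < 0)			/* count-driven: split mem_sz */
			return *cnt ? mem_sz / *cnt : 0;
		*cnt = mem_sz / (size_t)record_size;	/* size-driven: derive count */
		return *cnt ? (size_t)record_size : 0;
	}
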
@@ -543,6 +705,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
parse_size("ftrace-size", pdata->ftrace_size);
parse_size("pmsg-size", pdata->pmsg_size);
parse_size("ecc-size", pdata->ecc_info.ecc_size);
+ parse_size("flags", pdata->flags);
#undef parse_size
@@ -561,6 +724,7 @@ static int ramoops_probe(struct platform_device *pdev)
if (dev_of_node(dev) && !pdata) {
pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
if (!pdata) {
+ pr_err("cannot allocate platform data buffer\n");
err = -ENOMEM;
goto fail_out;
}
@@ -570,11 +734,20 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
}
- /* Only a single ramoops area allowed at a time, so fail extra
+ /*
+ * Only a single ramoops area allowed at a time, so fail extra
* probes.
*/
- if (cxt->max_dump_cnt)
+ if (cxt->max_dump_cnt) {
+ pr_err("already initialized\n");
goto fail_out;
+ }
+
+ /* Make sure we didn't get a bogus platform data pointer. */
+ if (!pdata) {
+ pr_err("NULL platform data\n");
+ goto fail_out;
+ }
if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
!pdata->ftrace_size && !pdata->pmsg_size)) {
@@ -600,27 +773,37 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->ftrace_size = pdata->ftrace_size;
cxt->pmsg_size = pdata->pmsg_size;
cxt->dump_oops = pdata->dump_oops;
+ cxt->flags = pdata->flags;
cxt->ecc_info = pdata->ecc_info;
paddr = cxt->phys_addr;
dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
- cxt->pmsg_size;
- err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
+ err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
+ dump_mem_sz, cxt->record_size,
+ &cxt->max_dump_cnt, 0, 0);
if (err)
goto fail_out;
- err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr,
+ err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr,
cxt->console_size, 0);
if (err)
goto fail_init_cprz;
- err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size,
- LINUX_VERSION_CODE);
+ cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
+ ? nr_cpu_ids
+ : 1;
+ err = ramoops_init_przs("ftrace", dev, cxt, &cxt->fprzs, &paddr,
+ cxt->ftrace_size, -1,
+ &cxt->max_ftrace_cnt, LINUX_VERSION_CODE,
+ (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
+ ? PRZ_FLAG_NO_LOCK : 0);
if (err)
goto fail_init_fprz;
- err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
+ err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
+ cxt->pmsg_size, 0);
if (err)
goto fail_init_mprz;
@@ -680,7 +863,6 @@ fail_clear:
cxt->pstore.bufsize = 0;
persistent_ram_free(cxt->mprz);
fail_init_mprz:
- persistent_ram_free(cxt->fprz);
fail_init_fprz:
persistent_ram_free(cxt->cprz);
fail_init_cprz:
@@ -699,7 +881,6 @@ static int ramoops_remove(struct platform_device *pdev)
cxt->pstore.bufsize = 0;
persistent_ram_free(cxt->mprz);
- persistent_ram_free(cxt->fprz);
persistent_ram_free(cxt->cprz);
ramoops_free_przs(cxt);
@@ -741,6 +922,8 @@ static void ramoops_register_dummy(void)
dummy_data->ftrace_size = ramoops_ftrace_size;
dummy_data->pmsg_size = ramoops_pmsg_size;
dummy_data->dump_oops = dump_oops;
+ dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU;
+
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
* (using 1 byte for ECC isn't much of use anyway).
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 3975deec02f8..a857338b7dab 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -48,16 +48,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
return atomic_read(&prz->buffer->start);
}
-static DEFINE_RAW_SPINLOCK(buffer_lock);
-
/* increase and wrap the start pointer, returning the old value */
static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
{
int old;
int new;
- unsigned long flags;
+ unsigned long flags = 0;
- raw_spin_lock_irqsave(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->start);
new = old + a;
@@ -65,7 +64,8 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
new -= prz->buffer_size;
atomic_set(&prz->buffer->start, new);
- raw_spin_unlock_irqrestore(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
return old;
}
@@ -75,9 +75,10 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
{
size_t old;
size_t new;
- unsigned long flags;
+ unsigned long flags = 0;
- raw_spin_lock_irqsave(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_lock_irqsave(&prz->buffer_lock, flags);
old = atomic_read(&prz->buffer->size);
if (old == prz->buffer_size)
@@ -89,7 +90,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
atomic_set(&prz->buffer->size, new);
exit:
- raw_spin_unlock_irqrestore(&buffer_lock, flags);
+ if (!(prz->flags & PRZ_FLAG_NO_LOCK))
+ raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
}
static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
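
The buffer lock also moves from a single global raw spinlock into each zone, and PRZ_FLAG_NO_LOCK elides it for per-cpu ftrace zones, where every CPU owns its own buffer and cross-CPU serialization would only add contention. The conditional-locking guard, sketched with assumed kernel primitives:

	#include <linux/spinlock.h>
	#include <linux/types.h>

	#define PRZ_FLAG_NO_LOCK	(1U << 0)

	struct zone {				/* stand-in for persistent_ram_zone */
		raw_spinlock_t lock;
		u32 flags;
	};

	static void zone_update(struct zone *z)
	{
		unsigned long irqflags = 0;	/* initialized: lock may be skipped */

		if (!(z->flags & PRZ_FLAG_NO_LOCK))
			raw_spin_lock_irqsave(&z->lock, irqflags);

		/* ... update z's buffer; single writer when NO_LOCK is set ... */

		if (!(z->flags & PRZ_FLAG_NO_LOCK))
			raw_spin_unlock_irqrestore(&z->lock, irqflags);
	}
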
@@ -465,7 +467,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
}
static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
- struct persistent_ram_ecc_info *ecc_info)
+ struct persistent_ram_ecc_info *ecc_info,
+ unsigned long flags)
{
int ret;
@@ -493,6 +496,8 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
prz->buffer->sig = sig;
persistent_ram_zap(prz);
+ prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
+ prz->flags = flags;
return 0;
}
@@ -517,7 +522,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
u32 sig, struct persistent_ram_ecc_info *ecc_info,
- unsigned int memtype)
+ unsigned int memtype, u32 flags)
{
struct persistent_ram_zone *prz;
int ret = -ENOMEM;
@@ -532,7 +537,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
if (ret)
goto err;
- ret = persistent_ram_post_init(prz, sig, ecc_info);
+ ret = persistent_ram_post_init(prz, sig, ecc_info, flags);
if (ret)
goto err;
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 8b252673d454..e99b1a72d9a7 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -12,14 +12,8 @@ static const struct genl_multicast_group quota_mcgrps[] = {
};
/* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
- /*
- * Needed due to multicast group ID abuse - old code assumed
- * the family ID was also a valid multicast group ID (which
- * isn't true) and userspace might thus rely on it. Assign a
- * static ID for this group to make dealing with that easier.
- */
- .id = GENL_ID_VFS_DQUOT,
+static struct genl_family quota_genl_family __ro_after_init = {
+ .module = THIS_MODULE,
.hdrsize = 0,
.name = "VFS_DQUOT",
.version = 1,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 58b2dedb2a3a..cfeae9b0a2b7 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -19,6 +19,7 @@
#include <linux/quotaops.h>
#include <linux/swap.h>
#include <linux/uio.h>
+#include <linux/bio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bc2dde2423c2..aa40c242f1db 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s,
mark_buffer_dirty(jl->j_commit_bh) ;
depth = reiserfs_write_unlock_nested(s);
if (reiserfs_barrier_flush(s))
- __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA);
+ __sync_dirty_buffer(jl->j_commit_bh,
+ REQ_PREFLUSH | REQ_FUA);
else
sync_dirty_buffer(jl->j_commit_bh);
reiserfs_write_lock_nested(s, depth);
@@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb,
depth = reiserfs_write_unlock_nested(sb);
if (reiserfs_barrier_flush(sb))
- __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA);
+ __sync_dirty_buffer(journal->j_header_bh,
+ REQ_PREFLUSH | REQ_FUA);
else
sync_dirty_buffer(journal->j_header_bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index a97e352d05d3..0037aea97d39 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -11,6 +11,7 @@
#include <linux/time.h>
#include <linux/string.h>
#include <linux/pagemap.h>
+#include <linux/bio.h>
#include "reiserfs.h"
#include <linux/buffer_head.h>
#include <linux/quotaops.h>
diff --git a/fs/splice.c b/fs/splice.c
index 5a7750bd2eea..8ed7c9d8c0fb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -17,6 +17,7 @@
* Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
*
*/
+#include <linux/bvec.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index ce62a380314f..2751476e6b6e 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -31,6 +31,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
+#include <linux/bio.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 7ff7712f284e..0a908ae7af13 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -50,3 +50,14 @@ config UBIFS_ATIME_SUPPORT
strictatime is the "heavy", relatime is "lighter", etc.
If unsure, say 'N'
+
+config UBIFS_FS_ENCRYPTION
+ bool "UBIFS Encryption"
+ depends on UBIFS_FS
+ select FS_ENCRYPTION
+ default n
+ help
+ Enable encryption of UBIFS files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index c54a24360f85..6f3251c2bf08 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -5,3 +5,4 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
ubifs-y += misc.o
+ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
new file mode 100644
index 000000000000..3402720f2b28
--- /dev/null
+++ b/fs/ubifs/crypto.c
@@ -0,0 +1,97 @@
+#include "ubifs.h"
+
+static int ubifs_crypt_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return ubifs_xattr_get(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len);
+}
+
+static int ubifs_crypt_set_context(struct inode *inode, const void *ctx,
+ size_t len, void *fs_data)
+{
+ return ubifs_xattr_set(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, 0);
+}
+
+static bool ubifs_crypt_empty_dir(struct inode *inode)
+{
+ return ubifs_check_dir_empty(inode) == 0;
+}
+
+static unsigned int ubifs_crypt_max_namelen(struct inode *inode)
+{
+ if (S_ISLNK(inode->i_mode))
+ return UBIFS_MAX_INO_DATA;
+ else
+ return UBIFS_MAX_NLEN;
+}
+
+static int ubifs_key_prefix(struct inode *inode, u8 **key)
+{
+ static char prefix[] = "ubifs:";
+
+ *key = prefix;
+
+ return sizeof(prefix) - 1;
+}
+
+int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int in_len, unsigned int *out_len, int block)
+{
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ void *p = &dn->data;
+ struct page *ret;
+ unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE);
+
+ ubifs_assert(pad_len <= *out_len);
+ dn->compr_size = cpu_to_le16(in_len);
+
+ /* pad to full block cipher length */
+ if (pad_len != in_len)
+ memset(p + in_len, 0, pad_len - in_len);
+
+ ret = fscrypt_encrypt_page(inode, virt_to_page(&dn->data), pad_len,
+ offset_in_page(&dn->data), block, GFP_NOFS);
+ if (IS_ERR(ret)) {
+ ubifs_err(c, "fscrypt_encrypt_page failed: %ld", PTR_ERR(ret));
+ return PTR_ERR(ret);
+ }
+ *out_len = pad_len;
+
+ return 0;
+}
+
+int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int *out_len, int block)
+{
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ int err;
+ unsigned int clen = le16_to_cpu(dn->compr_size);
+ unsigned int dlen = *out_len;
+
+ if (clen <= 0 || clen > UBIFS_BLOCK_SIZE || clen > dlen) {
+ ubifs_err(c, "bad compr_size: %i", clen);
+ return -EINVAL;
+ }
+
+ ubifs_assert(dlen <= UBIFS_BLOCK_SIZE);
+ err = fscrypt_decrypt_page(inode, virt_to_page(&dn->data), dlen,
+ offset_in_page(&dn->data), block);
+ if (err) {
+ ubifs_err(c, "fscrypt_decrypt_page failed: %i", err);
+ return err;
+ }
+ *out_len = clen;
+
+ return 0;
+}
+
+struct fscrypt_operations ubifs_crypt_operations = {
+ .flags = FS_CFLG_OWN_PAGES,
+ .get_context = ubifs_crypt_get_context,
+ .set_context = ubifs_crypt_set_context,
+ .is_encrypted = __ubifs_crypt_is_encrypted,
+ .empty_dir = ubifs_crypt_empty_dir,
+ .max_namelen = ubifs_crypt_max_namelen,
+ .key_prefix = ubifs_key_prefix,
+};
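
ubifs_encrypt() reuses the data node's compr_size field to record the true payload length: the buffer is zero-padded up to the cipher block size before fscrypt_encrypt_page(), and ubifs_decrypt() trims back to compr_size afterwards. The padding arithmetic alone, as a runnable sketch:

	#include <stdio.h>

	#define CIPHER_BLOCK 16u	/* stand-in for UBIFS_CIPHER_BLOCK_SIZE */

	/* round_up() for a power-of-two alignment, as the kernel macro computes it */
	static unsigned int round_up_pow2(unsigned int len, unsigned int align)
	{
		return (len + align - 1) & ~(align - 1);
	}

	int main(void)
	{
		unsigned int in_len = 100;
		unsigned int pad_len = round_up_pow2(in_len, CIPHER_BLOCK);

		/* bytes [in_len, pad_len) are memset to 0 before encryption */
		printf("in=%u padded=%u zero-fill=%u\n",
		       in_len, pad_len, pad_len - in_len);
		return 0;
	}
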
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 69e287e20732..1e712a364680 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -233,7 +233,7 @@ static void dump_ch(const struct ubifs_ch *ch)
void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
{
const struct ubifs_inode *ui = ubifs_inode(inode);
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
union ubifs_key key;
struct ubifs_dent_node *dent, *pdent = NULL;
int count = 2;
@@ -289,8 +289,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
pr_err("\t%d: %s (%s)\n",
count++, dent->name, get_dent_type(dent->type));
- nm.name = dent->name;
- nm.len = le16_to_cpu(dent->nlen);
+ fname_name(&nm) = dent->name;
+ fname_len(&nm) = le16_to_cpu(dent->nlen);
kfree(pdent);
pdent = dent;
key_read(c, &dent->key, &key);
@@ -1107,7 +1107,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
unsigned int nlink = 2;
union ubifs_key key;
struct ubifs_dent_node *dent, *pdent = NULL;
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
loff_t size = UBIFS_INO_NODE_SZ;
if (!dbg_is_chk_gen(c))
@@ -1128,9 +1128,9 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
return err;
}
- nm.name = dent->name;
- nm.len = le16_to_cpu(dent->nlen);
- size += CALC_DENT_SIZE(nm.len);
+ fname_name(&nm) = dent->name;
+ fname_len(&nm) = le16_to_cpu(dent->nlen);
+ size += CALC_DENT_SIZE(fname_len(&nm));
if (dent->type == UBIFS_ITYPE_DIR)
nlink += 1;
kfree(pdent);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ca16c5d7bab1..1c5331ac9614 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -85,11 +85,26 @@ static int inherit_flags(const struct inode *dir, umode_t mode)
* initializes it. Returns new inode in case of success and an error code in
* case of failure.
*/
-struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
umode_t mode)
{
+ int err;
struct inode *inode;
struct ubifs_inode *ui;
+ bool encrypted = false;
+
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err) {
+ ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err);
+ return ERR_PTR(err);
+ }
+
+ if (!fscrypt_has_encryption_key(dir))
+ return ERR_PTR(-EPERM);
+
+ encrypted = true;
+ }
inode = new_inode(c->vfs_sb);
ui = ubifs_inode(inode);
@@ -165,18 +180,29 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
*/
ui->creat_sqnum = ++c->max_sqnum;
spin_unlock(&c->cnt_lock);
+
+ if (encrypted) {
+ err = fscrypt_inherit_context(dir, inode, &encrypted, true);
+ if (err) {
+ ubifs_err(c, "fscrypt_inherit_context failed: %i", err);
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ }
+
return inode;
}
static int dbg_check_name(const struct ubifs_info *c,
const struct ubifs_dent_node *dent,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
if (!dbg_is_chk_gen(c))
return 0;
- if (le16_to_cpu(dent->nlen) != nm->len)
+ if (le16_to_cpu(dent->nlen) != fname_len(nm))
return -EINVAL;
- if (memcmp(dent->name, nm->name, nm->len))
+ if (memcmp(dent->name, fname_name(nm), fname_len(nm)))
return -EINVAL;
return 0;
}
@@ -189,30 +215,61 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct ubifs_dent_node *dent;
struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct fscrypt_name nm;
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
- if (dentry->d_name.len > UBIFS_MAX_NLEN)
- return ERR_PTR(-ENAMETOOLONG);
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+
+ /*
+ * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
+ * created while the directory was encrypted and we
+ * have access to the key.
+ */
+ if (fscrypt_has_encryption_key(dir))
+ fscrypt_set_encrypted_dentry(dentry);
+ fscrypt_set_d_op(dentry);
+ if (err && err != -ENOKEY)
+ return ERR_PTR(err);
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
+ if (err)
+ return ERR_PTR(err);
+
+ if (fname_len(&nm) > UBIFS_MAX_NLEN) {
+ err = -ENAMETOOLONG;
+ goto out_fname;
+ }
dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
- if (!dent)
- return ERR_PTR(-ENOMEM);
+ if (!dent) {
+ err = -ENOMEM;
+ goto out_fname;
+ }
- dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
+ if (nm.hash) {
+ ubifs_assert(fname_len(&nm) == 0);
+ ubifs_assert(fname_name(&nm) == NULL);
+ dent_key_init_hash(c, &key, dir->i_ino, nm.hash);
+ err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash);
+ } else {
+ dent_key_init(c, &key, dir->i_ino, &nm);
+ err = ubifs_tnc_lookup_nm(c, &key, dent, &nm);
+ }
- err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
if (err) {
if (err == -ENOENT) {
dbg_gen("not found");
goto done;
}
- goto out;
+ goto out_dent;
}
- if (dbg_check_name(c, dent, &dentry->d_name)) {
+ if (dbg_check_name(c, dent, &nm)) {
err = -EINVAL;
- goto out;
+ goto out_dent;
}
inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
@@ -225,11 +282,12 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
ubifs_err(c, "dead directory entry '%pd', error %d",
dentry, err);
ubifs_ro_mode(c, err);
- goto out;
+ goto out_dent;
}
done:
kfree(dent);
+ fscrypt_free_filename(&nm);
/*
* Note, d_splice_alias() would be required instead if we supported
* NFS.
@@ -237,8 +295,10 @@ done:
d_add(dentry, inode);
return NULL;
-out:
+out_dent:
kfree(dent);
+out_fname:
+ fscrypt_free_filename(&nm);
return ERR_PTR(err);
}
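
The qstr-to-fscrypt_name conversion in ubifs_lookup() sets the pattern for the rest of the series: fscrypt_setup_filename() may allocate a disk name (or leave only a hash when no key is present), so every exit path must reach fscrypt_free_filename(), hence the out_dent/out_fname goto ladder above. The skeleton of that lifecycle, hedged to the 4.10-era API used here:

	#include <linux/fs.h>
	#include <linux/fscrypto.h>	/* renamed to fscrypt.h in later kernels */

	static int dir_op(struct inode *dir, struct dentry *dentry)
	{
		struct fscrypt_name nm;
		int err;

		err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
		if (err)
			return err;

		/* ... use fname_name(&nm)/fname_len(&nm), or nm.hash if keyless ... */

		fscrypt_free_filename(&nm);	/* required on every path past setup */
		return 0;
	}
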
@@ -247,10 +307,11 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
- int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1 };
struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct fscrypt_name nm;
+ int err, sz_change;
/*
* Budget request settings: new inode, new direntry, changing the
@@ -264,10 +325,16 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (err)
return err;
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
inode = ubifs_new_inode(c, dir, mode);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
err = ubifs_init_security(dir, inode, &dentry->d_name);
@@ -278,12 +345,13 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&nm);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
return 0;
@@ -295,6 +363,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
ubifs_err(c, "cannot create regular file, error %d", err);
@@ -310,6 +380,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
int err, instantiated = 0;
+ struct fscrypt_name nm;
/*
* Budget request settings: new dirty inode, new direntry,
@@ -319,13 +390,30 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
dentry, mode, dir->i_ino);
- err = ubifs_budget_space(c, &req);
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ return err;
+
+ if (!fscrypt_has_encryption_key(dir)) {
+ return -EPERM;
+ }
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
return err;
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ fscrypt_free_filename(&nm);
+ return err;
+ }
+
err = ubifs_budget_space(c, &ino_req);
if (err) {
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&nm);
return err;
}
@@ -361,7 +449,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
mutex_unlock(&ui->ui_mutex);
mutex_lock(&dir_ui->ui_mutex);
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
@@ -380,6 +468,7 @@ out_budg:
ubifs_release_budget(c, &req);
if (!instantiated)
ubifs_release_budget(c, &ino_req);
+ fscrypt_free_filename(&nm);
ubifs_err(c, "cannot create temporary file, error %d", err);
return err;
}
@@ -439,12 +528,14 @@ static unsigned int vfs_dent_type(uint8_t type)
*/
static int ubifs_readdir(struct file *file, struct dir_context *ctx)
{
- int err = 0;
- struct qstr nm;
+ int fstr_real_len = 0, err = 0;
+ struct fscrypt_name nm;
+ struct fscrypt_str fstr = {0};
union ubifs_key key;
struct ubifs_dent_node *dent;
struct inode *dir = file_inode(file);
struct ubifs_info *c = dir->i_sb->s_fs_info;
+ bool encrypted = ubifs_crypt_is_encrypted(dir);
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
@@ -455,6 +546,18 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
*/
return 0;
+ if (encrypted) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err && err != -ENOKEY)
+ return err;
+
+ err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr);
+ if (err)
+ return err;
+
+ fstr_real_len = fstr.len;
+ }
+
if (file->f_version == 0) {
/*
* The file was seek'ed, which means that @file->private_data
@@ -476,12 +579,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
/* File positions 0 and 1 correspond to "." and ".." */
if (ctx->pos < 2) {
ubifs_assert(!file->private_data);
- if (!dir_emit_dots(file, ctx))
+ if (!dir_emit_dots(file, ctx)) {
+ if (encrypted)
+ fscrypt_fname_free_buffer(&fstr);
return 0;
+ }
/* Find the first entry in TNC and save it */
lowest_dent_key(c, &key, dir->i_ino);
- nm.name = NULL;
+ fname_len(&nm) = 0;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
@@ -499,7 +605,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
* Find the entry corresponding to @ctx->pos or the closest one.
*/
dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
- nm.name = NULL;
+ fname_len(&nm) = 0;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
@@ -516,15 +622,33 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
ubifs_inode(dir)->creat_sqnum);
- nm.len = le16_to_cpu(dent->nlen);
- if (!dir_emit(ctx, dent->name, nm.len,
+ fname_len(&nm) = le16_to_cpu(dent->nlen);
+ fname_name(&nm) = dent->name;
+
+ if (encrypted) {
+ fstr.len = fstr_real_len;
+
+ err = fscrypt_fname_disk_to_usr(dir, key_hash_flash(c,
+ &dent->key),
+ le32_to_cpu(dent->cookie),
+ &nm.disk_name, &fstr);
+ if (err)
+ goto out;
+ } else {
+ fstr.len = fname_len(&nm);
+ fstr.name = fname_name(&nm);
+ }
+
+ if (!dir_emit(ctx, fstr.name, fstr.len,
le64_to_cpu(dent->inum),
- vfs_dent_type(dent->type)))
+ vfs_dent_type(dent->type))) {
+ if (encrypted)
+ fscrypt_fname_free_buffer(&fstr);
return 0;
+ }
/* Switch to the next entry */
key_read(c, &dent->key, &key);
- nm.name = dent->name;
dent = ubifs_tnc_next_ent(c, &key, &nm);
if (IS_ERR(dent)) {
err = PTR_ERR(dent);
@@ -541,6 +665,9 @@ out:
kfree(file->private_data);
file->private_data = NULL;
+ if (encrypted)
+ fscrypt_fname_free_buffer(&fstr);
+
if (err != -ENOENT)
ubifs_err(c, "cannot find next direntry, error %d", err);
else
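The readdir loop above reuses a single output buffer for all entries: it is
allocated once for UBIFS_MAX_NLEN, and fstr.len is restored from
fstr_real_len on every iteration because fscrypt_fname_disk_to_usr()
shrinks it to the decrypted length. A condensed sketch of the per-entry
handling, wrapped in a hypothetical helper:

static int emit_one_name(struct inode *dir, struct dir_context *ctx,
			 struct fscrypt_str *out, int max_len,
			 u32 hash, u32 cookie,
			 const struct fscrypt_str *disk_name,
			 u64 inum, unsigned int d_type)
{
	int err;

	out->len = max_len;	/* restore full capacity before decrypting */
	err = fscrypt_fname_disk_to_usr(dir, hash, cookie, disk_name, out);
	if (err)
		return err;

	/* out->name now holds the user-visible (or no-key encoded) name */
	if (!dir_emit(ctx, out->name, out->len, inum, d_type))
		return -EINVAL;	/* sketch: caller stops iterating */
	return 0;
}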
@@ -601,6 +728,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
+ struct fscrypt_name nm;
/*
* Budget request settings: new direntry, changing the target inode,
@@ -613,13 +741,29 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
ubifs_assert(inode_is_locked(dir));
ubifs_assert(inode_is_locked(inode));
- err = dbg_check_synced_i_size(c, inode);
+ if (ubifs_crypt_is_encrypted(dir)) {
+ if (!fscrypt_has_permitted_context(dir, inode))
+ return -EPERM;
+
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ return err;
+
+ if (!fscrypt_has_encryption_key(inode))
+ return -EPERM;
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
if (err)
return err;
+ err = dbg_check_synced_i_size(c, inode);
+ if (err)
+ goto out_fname;
+
err = ubifs_budget_space(c, &req);
if (err)
- return err;
+ goto out_fname;
lock_2_inodes(dir, inode);
inc_nlink(inode);
@@ -628,13 +772,14 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -644,6 +789,8 @@ out_cancel:
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
return err;
}
@@ -652,10 +799,10 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct inode *inode = d_inode(dentry);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
- int err, budgeted = 1;
+ int err, sz_change, budgeted = 1;
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
unsigned int saved_nlink = inode->i_nlink;
+ struct fscrypt_name nm;
/*
* Budget request settings: deletion direntry, deletion inode (+1 for
@@ -667,16 +814,29 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
+
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err && err != -ENOKEY)
+ return err;
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
+ if (err)
+ return err;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
ubifs_assert(inode_is_locked(dir));
ubifs_assert(inode_is_locked(inode));
err = dbg_check_synced_i_size(c, inode);
if (err)
- return err;
+ goto out_fname;
err = ubifs_budget_space(c, &req);
if (err) {
if (err != -ENOSPC)
- return err;
+ goto out_fname;
budgeted = 0;
}
@@ -686,7 +846,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
@@ -698,6 +858,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -707,21 +868,23 @@ out_cancel:
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
+out_fname:
+ fscrypt_free_filename(&nm);
return err;
}
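Note the asymmetry between creation and removal: unlink() here (and rmdir()
below) pass 1 as the lookup argument to fscrypt_setup_filename() and
tolerate -ENOKEY from fscrypt_get_encryption_info(), while the creation
paths pass 0. A hedged reading of that API contract, as a hypothetical
wrapper:

static int setup_name(struct inode *dir, const struct qstr *name,
		      bool removal, struct fscrypt_name *nm)
{
	/*
	 * lookup=1: the name may also be matched in its no-key encoded
	 * form, so entries stay deletable when the key is absent;
	 * lookup=0: creation, which must fail without the key.
	 */
	return fscrypt_setup_filename(dir, name, removal ? 1 : 0, nm);
}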
/**
- * check_dir_empty - check if a directory is empty or not.
+ * ubifs_check_dir_empty - check if a directory is empty or not.
- * @c: UBIFS file-system description object
* @dir: VFS inode object of the directory to check
*
* This function checks if directory @dir is empty. Returns zero if the
* directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
* in case of errors.
*/
-static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
+int ubifs_check_dir_empty(struct inode *dir)
{
- struct qstr nm = { .name = NULL };
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct fscrypt_name nm = { 0 };
struct ubifs_dent_node *dent;
union ubifs_key key;
int err;
@@ -743,10 +906,10 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
struct inode *inode = d_inode(dentry);
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
- int err, budgeted = 1;
+ int err, sz_change, budgeted = 1;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+ struct fscrypt_name nm;
/*
* Budget request settings: deletion direntry, deletion inode and
@@ -758,14 +921,26 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ino, dir->i_ino);
ubifs_assert(inode_is_locked(dir));
ubifs_assert(inode_is_locked(inode));
- err = check_dir_empty(c, d_inode(dentry));
+ err = ubifs_check_dir_empty(d_inode(dentry));
if (err)
return err;
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err && err != -ENOKEY)
+ return err;
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm);
+ if (err)
+ return err;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
err = ubifs_budget_space(c, &req);
if (err) {
if (err != -ENOSPC)
- return err;
+ goto out_fname;
budgeted = 0;
}
@@ -776,7 +951,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
unlock_2_inodes(dir, inode);
@@ -788,6 +963,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -798,6 +974,8 @@ out_cancel:
unlock_2_inodes(dir, inode);
if (budgeted)
ubifs_release_budget(c, &req);
+out_fname:
+ fscrypt_free_filename(&nm);
return err;
}
@@ -806,8 +984,9 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
- int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, sz_change;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+ struct fscrypt_name nm;
/*
* Budget request settings: new inode, new direntry and changing parent
@@ -821,10 +1000,27 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (err)
return err;
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ goto out_budg;
+
+ if (!fscrypt_has_encryption_key(dir)) {
+ err = -EPERM;
+ goto out_budg;
+ }
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
err = ubifs_init_security(dir, inode, &dentry->d_name);
@@ -838,7 +1034,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
ubifs_err(c, "cannot create directory, error %d", err);
goto out_cancel;
@@ -847,6 +1043,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -857,6 +1054,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
@@ -870,11 +1069,12 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
union ubifs_dev_desc *dev = NULL;
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int sz_change;
int err, devlen = 0;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.new_ino_d = ALIGN(devlen, 8),
.dirtied_ino = 1 };
+ struct fscrypt_name nm;
/*
* Budget request settings: new inode, new direntry and changing parent
@@ -896,11 +1096,28 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
return err;
}
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ goto out_budg;
+
+ if (!fscrypt_has_encryption_key(dir)) {
+ err = -EPERM;
+ goto out_budg;
+ }
+ }
+
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
+ sz_change = CALC_DENT_SIZE(fname_len(&nm));
+
inode = ubifs_new_inode(c, dir, mode);
if (IS_ERR(inode)) {
kfree(dev);
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
init_special_inode(inode, inode->i_mode, rdev);
@@ -917,7 +1134,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
@@ -925,6 +1142,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
ubifs_release_budget(c, &req);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -934,6 +1152,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
@@ -947,10 +1167,27 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, len = strlen(symname);
- int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int sz_change = CALC_DENT_SIZE(len);
+ struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
+ struct fscrypt_symlink_data *sd = NULL;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.new_ino_d = ALIGN(len, 8),
.dirtied_ino = 1 };
+ struct fscrypt_name nm;
+
+ if (ubifs_crypt_is_encrypted(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ return err;
+
+ if (!fscrypt_has_encryption_key(dir))
+ return -EPERM;
+
+ disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
+ sizeof(struct fscrypt_symlink_data));
+ }
/*
* Budget request settings: new inode, new direntry and changing parent
@@ -960,36 +1197,77 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry,
symname, dir->i_ino);
- if (len > UBIFS_MAX_INO_DATA)
+ if (disk_link.len > UBIFS_MAX_INO_DATA)
return -ENAMETOOLONG;
err = ubifs_budget_space(c, &req);
if (err)
return err;
+ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
+ if (err)
+ goto out_budg;
+
inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- goto out_budg;
+ goto out_fname;
}
ui = ubifs_inode(inode);
- ui->data = kmalloc(len + 1, GFP_NOFS);
+ ui->data = kmalloc(disk_link.len, GFP_NOFS);
if (!ui->data) {
err = -ENOMEM;
goto out_inode;
}
- memcpy(ui->data, symname, len);
- ((char *)ui->data)[len] = '\0';
- inode->i_link = ui->data;
+ if (ubifs_crypt_is_encrypted(dir)) {
+ struct qstr istr = QSTR_INIT(symname, len);
+ struct fscrypt_str ostr;
+
+ sd = kzalloc(disk_link.len, GFP_NOFS);
+ if (!sd) {
+ err = -ENOMEM;
+ goto out_inode;
+ }
+
+ err = fscrypt_get_encryption_info(inode);
+ if (err) {
+ kfree(sd);
+ goto out_inode;
+ }
+
+ if (!fscrypt_has_encryption_key(inode)) {
+ kfree(sd);
+ err = -EPERM;
+ goto out_inode;
+ }
+
+ ostr.name = sd->encrypted_path;
+ ostr.len = disk_link.len;
+
+ err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
+ if (err) {
+ kfree(sd);
+ goto out_inode;
+ }
+
+ sd->len = cpu_to_le16(ostr.len);
+ disk_link.name = (char *)sd;
+ } else {
+ inode->i_link = ui->data;
+ }
+
+ memcpy(ui->data, disk_link.name, disk_link.len);
+ ((char *)ui->data)[disk_link.len - 1] = '\0';
+
/*
* The terminating zero byte is not written to the flash media and it
* is put just to make later in-memory string processing simpler. Thus,
- * data length is @len, not @len + %1.
+ * data length is disk_link.len - 1, not disk_link.len.
*/
- ui->data_len = len;
- inode->i_size = ubifs_inode(inode)->ui_size = len;
+ ui->data_len = disk_link.len - 1;
+ inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1;
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
@@ -999,7 +1277,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
dir->i_mtime = dir->i_ctime = inode->i_ctime;
- err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
mutex_unlock(&dir_ui->ui_mutex);
@@ -1007,6 +1285,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
ubifs_release_budget(c, &req);
insert_inode_hash(inode);
d_instantiate(dentry, inode);
+ fscrypt_free_filename(&nm);
return 0;
out_cancel:
@@ -1016,6 +1295,8 @@ out_cancel:
out_inode:
make_bad_inode(inode);
iput(inode);
+out_fname:
+ fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
return err;
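For reference, the payload ubifs_symlink() builds in the encrypted case.
This is the layout from the fscrypt header of this kernel generation,
restated here for readability (an assumption about the header, not code
from this patch):

struct fscrypt_symlink_data {
	__le16 len;		/* ciphertext length, read back in get_link() */
	char encrypted_path[1];	/* ciphertext; not NUL-terminated on flash */
} __packed;

Hence disk_link.len is the encrypted name size plus this header, and
ui->data_len = disk_link.len - 1 excludes the in-memory trailing NUL.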
@@ -1078,15 +1359,14 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ubifs_inode *whiteout_ui = NULL;
int err, release, sync = 0, move = (new_dir != old_dir);
int is_dir = S_ISDIR(old_inode->i_mode);
- int unlink = !!new_inode;
- int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
- int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
+ int unlink = !!new_inode, new_sz, old_sz;
struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
.dirtied_ino = 3 };
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec time;
unsigned int uninitialized_var(saved_nlink);
+ struct fscrypt_name old_nm, new_nm;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -1107,17 +1387,41 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlink)
ubifs_assert(inode_is_locked(new_inode));
+ if (old_dir != new_dir) {
+ if (ubifs_crypt_is_encrypted(new_dir) &&
+ !fscrypt_has_permitted_context(new_dir, old_inode))
+ return -EPERM;
+ }
+
if (unlink && is_dir) {
- err = check_dir_empty(c, new_inode);
+ err = ubifs_check_dir_empty(new_inode);
if (err)
return err;
}
- err = ubifs_budget_space(c, &req);
+ err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_nm);
if (err)
return err;
+
+ err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_nm);
+ if (err) {
+ fscrypt_free_filename(&old_nm);
+ return err;
+ }
+
+ new_sz = CALC_DENT_SIZE(fname_len(&new_nm));
+ old_sz = CALC_DENT_SIZE(fname_len(&old_nm));
+
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
+ return err;
+ }
err = ubifs_budget_space(c, &ino_req);
if (err) {
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
ubifs_release_budget(c, &req);
return err;
}
@@ -1239,8 +1543,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
iput(whiteout);
}
- err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, whiteout,
- sync);
+ err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
+ new_inode, &new_nm, whiteout, sync);
if (err)
goto out_cancel;
@@ -1256,6 +1560,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
ubifs_release_budget(c, &ino_req);
if (IS_SYNC(old_inode))
err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
+
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
return err;
out_cancel:
@@ -1284,6 +1591,8 @@ out_cancel:
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
ubifs_release_budget(c, &ino_req);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
return err;
}
@@ -1298,9 +1607,27 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *snd_inode = d_inode(new_dentry);
struct timespec time;
int err;
+ struct fscrypt_name fst_nm, snd_nm;
ubifs_assert(fst_inode && snd_inode);
+ if ((ubifs_crypt_is_encrypted(old_dir) ||
+ ubifs_crypt_is_encrypted(new_dir)) &&
+ (old_dir != new_dir) &&
+ (!fscrypt_has_permitted_context(new_dir, fst_inode) ||
+ !fscrypt_has_permitted_context(old_dir, snd_inode)))
+ return -EPERM;
+
+ err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm);
+ if (err)
+ return err;
+
+ err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &snd_nm);
+ if (err) {
+ fscrypt_free_filename(&fst_nm);
+ return err;
+ }
+
lock_4_inodes(old_dir, new_dir, NULL, NULL);
time = ubifs_current_time(old_dir);
@@ -1320,12 +1647,14 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- err = ubifs_jnl_xrename(c, old_dir, old_dentry, new_dir, new_dentry,
- sync);
+ err = ubifs_jnl_xrename(c, old_dir, fst_inode, &fst_nm, new_dir,
+ snd_inode, &snd_nm, sync);
unlock_4_inodes(old_dir, new_dir, NULL, NULL);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&fst_nm);
+ fscrypt_free_filename(&snd_nm);
return err;
}
@@ -1384,6 +1713,14 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+static int ubifs_dir_open(struct inode *dir, struct file *file)
+{
+ if (ubifs_crypt_is_encrypted(dir))
+ return fscrypt_get_encryption_info(dir) ? -EACCES : 0;
+
+ return 0;
+}
+
const struct inode_operations ubifs_dir_inode_operations = {
.lookup = ubifs_lookup,
.create = ubifs_create,
@@ -1410,6 +1747,7 @@ const struct file_operations ubifs_dir_operations = {
.iterate_shared = ubifs_readdir,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
+ .open = ubifs_dir_open,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b4fbeefba246..aa0625f4f642 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -78,6 +78,13 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
goto dump;
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_decrypt(inode, dn, &dlen, block);
+ if (err)
+ goto dump;
+ }
+
out_len = UBIFS_BLOCK_SIZE;
err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
le16_to_cpu(dn->compr_type));
@@ -650,6 +657,13 @@ static int populate_page(struct ubifs_info *c, struct page *page,
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
out_len = UBIFS_BLOCK_SIZE;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_decrypt(inode, dn, &dlen, page_block);
+ if (err)
+ goto out_err;
+ }
+
err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
le16_to_cpu(dn->compr_type));
if (err || len != out_len)
@@ -1594,6 +1608,15 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
int err;
+ struct inode *inode = file->f_mapping->host;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ return -EACCES;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
err = generic_file_mmap(file, vma);
if (err)
@@ -1605,6 +1628,88 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int ubifs_file_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+ struct dentry *dir;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ ret = fscrypt_get_encryption_info(inode);
+ if (ret)
+ return -EACCES;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
+ }
+
+ dir = dget_parent(file_dentry(filp));
+ if (ubifs_crypt_is_encrypted(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+ ubifs_err(c, "Inconsistent encryption contexts: %lu/%lu",
+ (unsigned long) d_inode(dir)->i_ino,
+ (unsigned long) inode->i_ino);
+ dput(dir);
+ ubifs_ro_mode(c, -EPERM);
+ return -EPERM;
+ }
+ dput(dir);
+
+ return 0;
+}
+
+static const char *ubifs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ int err;
+ struct fscrypt_symlink_data *sd;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct fscrypt_str cstr;
+ struct fscrypt_str pstr;
+
+ if (!ubifs_crypt_is_encrypted(inode))
+ return ui->data;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ err = fscrypt_get_encryption_info(inode);
+ if (err)
+ return ERR_PTR(err);
+
+ sd = (struct fscrypt_symlink_data *)ui->data;
+ cstr.name = sd->encrypted_path;
+ cstr.len = le16_to_cpu(sd->len);
+
+ if (cstr.len == 0)
+ return ERR_PTR(-ENOENT);
+
+ if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > ui->data_len)
+ return ERR_PTR(-EIO);
+
+ err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ if (err)
+ return ERR_PTR(err);
+
+ err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
+ if (err) {
+ fscrypt_fname_free_buffer(&pstr);
+ return ERR_PTR(err);
+ }
+
+ pstr.name[pstr.len] = '\0';
+
+ /* XXX: an empty decrypted name should not happen anymore */
+ if (pstr.name[0] == '\0') {
+ fscrypt_fname_free_buffer(&pstr);
+ return ERR_PTR(-ENOENT);
+ }
+
+ set_delayed_call(done, kfree_link, pstr.name);
+ return pstr.name;
+}
+
const struct address_space_operations ubifs_file_address_operations = {
.readpage = ubifs_readpage,
.writepage = ubifs_writepage,
@@ -1629,7 +1734,7 @@ const struct inode_operations ubifs_file_inode_operations = {
const struct inode_operations ubifs_symlink_inode_operations = {
.readlink = generic_readlink,
- .get_link = simple_get_link,
+ .get_link = ubifs_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.listxattr = ubifs_listxattr,
@@ -1647,6 +1752,7 @@ const struct file_operations ubifs_file_operations = {
.unlocked_ioctl = ubifs_ioctl,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .open = ubifs_file_open,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e845c64b6ce1..7b35e3d6cde7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -846,10 +846,6 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
*/
while (1) {
lp = ubifs_fast_find_freeable(c);
- if (IS_ERR(lp)) {
- err = PTR_ERR(lp);
- goto out;
- }
if (!lp)
break;
ubifs_assert(!(lp->flags & LPROPS_TAKEN));
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 97be41215332..3be28900bf37 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -452,16 +452,22 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
*/
static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
{
+ ktime_t softlimit = ms_to_ktime(dirty_writeback_interval * 10);
+ unsigned long long delta = dirty_writeback_interval;
+
+ /* centi to milli, milli to nano, then 10% */
+ delta *= 10ULL * NSEC_PER_MSEC / 10ULL;
+
ubifs_assert(!hrtimer_active(&wbuf->timer));
+ ubifs_assert(delta <= ULONG_MAX);
if (wbuf->no_timer)
return;
dbg_io("set timer for jhead %s, %llu-%llu millisecs",
dbg_jhead(wbuf->jhead),
- div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
- div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
- USEC_PER_SEC));
- hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta,
+ div_u64(ktime_to_ns(softlimit), USEC_PER_SEC),
+ div_u64(ktime_to_ns(softlimit) + delta, USEC_PER_SEC));
+ hrtimer_start_range_ns(&wbuf->timer, softlimit, delta,
HRTIMER_MODE_REL);
}
@@ -1059,10 +1065,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
wbuf->timer.function = wbuf_timer_callback_nolock;
- wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0);
- wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT;
- wbuf->delta *= 1000000000ULL;
- ubifs_assert(wbuf->delta <= ULONG_MAX);
return 0;
}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 3c7b29de0ca7..78d713644df3 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -181,6 +181,26 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
mnt_drop_write_file(file);
return err;
}
+ case FS_IOC_SET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+ err = ubifs_enable_encryption(c);
+ if (err)
+ return err;
+
+ return fscrypt_ioctl_set_policy(file, (const void __user *)arg);
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
+ case FS_IOC_GET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+ return fscrypt_ioctl_get_policy(file, (void __user *)arg);
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
default:
return -ENOTTY;
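From userspace the two new cases look as follows; this sketch assumes the
uapi definitions of this era (struct fscrypt_policy and the two ioctl
numbers lived in linux/fs.h at the time), and filling in a real policy is
left out. Note that setting a policy first flips UBIFS_FLG_ENCRYPTION in
the superblock via ubifs_enable_encryption() (see fs/ubifs/sb.c below).

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fscrypt_policy pol;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* an empty directory on UBIFS */
	if (fd < 0)
		return 1;

	memset(&pol, 0, sizeof(pol));	/* a real caller sets modes and key descriptor */
	if (ioctl(fd, FS_IOC_SET_ENCRYPTION_POLICY, &pol))
		perror("FS_IOC_SET_ENCRYPTION_POLICY");
	else if (ioctl(fd, FS_IOC_GET_ENCRYPTION_POLICY, &pol) == 0)
		printf("policy version %u\n", pol.version);

	close(fd);
	return 0;
}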
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 91bc76dc559e..a459211a1c21 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -78,16 +78,6 @@ static inline void zero_ino_node_unused(struct ubifs_ino_node *ino)
static inline void zero_dent_node_unused(struct ubifs_dent_node *dent)
{
dent->padding1 = 0;
- memset(dent->padding2, 0, 4);
-}
-
-/**
- * zero_data_node_unused - zero out unused fields of an on-flash data node.
- * @data: the data node to zero out
- */
-static inline void zero_data_node_unused(struct ubifs_data_node *data)
-{
- memset(data->padding, 0, 2);
}
/**
@@ -511,6 +501,14 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
ui->dirty = 0;
}
+static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
+{
+ if (c->double_hash)
+ dent->cookie = prandom_u32();
+ else
+ dent->cookie = 0;
+}
+
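The cookie gives every directory entry a second identifier next to the
29-bit R5 name hash, so entries whose hashes collide can be told apart
(see do_lookup_dh() in tnc.c below). It occupies the four bytes that used
to be padding2, which is why the memset in zero_dent_node_unused() goes
away above. The resulting on-flash layout, restated from ubifs-media.h
for reference:

struct ubifs_dent_node {
	struct ubifs_ch ch;
	__u8 key[UBIFS_MAX_KEY_LEN];
	__le64 inum;
	__u8 padding1;
	__u8 type;
	__le16 nlen;
	__le32 cookie;	/* prandom_u32() on double_hash filesystems, else 0 */
	__u8 name[];
} __packed;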
/**
* ubifs_jnl_update - update inode.
* @c: UBIFS file-system description object
@@ -539,7 +537,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
* success. In case of failure, a negative error code is returned.
*/
int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
- const struct qstr *nm, const struct inode *inode,
+ const struct fscrypt_name *nm, const struct inode *inode,
int deletion, int xent)
{
int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
@@ -551,11 +549,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
struct ubifs_ino_node *ino;
union ubifs_key dent_key, ino_key;
- dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
- inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
+ //dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
+ // inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
- dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+ dlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1;
ilen = UBIFS_INO_NODE_SZ;
/*
@@ -596,9 +594,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
key_write(c, &dent_key, dent->key);
dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
dent->type = get_dent_type(inode->i_mode);
- dent->nlen = cpu_to_le16(nm->len);
- memcpy(dent->name, nm->name, nm->len);
- dent->name[nm->len] = '\0';
+ dent->nlen = cpu_to_le16(fname_len(nm));
+ memcpy(dent->name, fname_name(nm), fname_len(nm));
+ dent->name[fname_len(nm)] = '\0';
+ set_dent_cookie(c, dent);
+
zero_dent_node_unused(dent);
ubifs_prep_grp_node(c, dent, dlen, 0);
@@ -697,14 +697,18 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
const union ubifs_key *key, const void *buf, int len)
{
struct ubifs_data_node *data;
- int err, lnum, offs, compr_type, out_len;
+ int err, lnum, offs, compr_type, out_len, compr_len;
int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
struct ubifs_inode *ui = ubifs_inode(inode);
+ bool encrypted = ubifs_crypt_is_encrypted(inode);
dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
(unsigned long)key_inum(c, key), key_block(c, key), len);
ubifs_assert(len <= UBIFS_BLOCK_SIZE);
+ if (encrypted)
+ dlen += UBIFS_CIPHER_BLOCK_SIZE;
+
data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
if (!data) {
/*
@@ -722,7 +726,6 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
data->ch.node_type = UBIFS_DATA_NODE;
key_write(c, key, &data->key);
data->size = cpu_to_le32(len);
- zero_data_node_unused(data);
if (!(ui->flags & UBIFS_COMPR_FL))
/* Compression is disabled for this inode */
@@ -730,9 +733,18 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
else
compr_type = ui->compr_type;
- out_len = dlen - UBIFS_DATA_NODE_SZ;
- ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type);
- ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+ out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ;
+ ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type);
+ ubifs_assert(compr_len <= UBIFS_BLOCK_SIZE);
+
+ if (encrypted) {
+ err = ubifs_encrypt(inode, data, compr_len, &out_len, key_block(c, key));
+ if (err)
+ goto out_free;
+
+ } else {
+ data->compr_size = 0;
+ }
dlen = UBIFS_DATA_NODE_SZ + out_len;
data->compr_type = cpu_to_le16(compr_type);
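Ordering matters in this path: compression has to run before encryption,
because ciphertext is effectively incompressible, and encryption may pad
the payload up to UBIFS_CIPHER_BLOCK_SIZE (the extra headroom added to
dlen above). A minimal model of that pipeline with hypothetical helpers,
not the patch's code:

static size_t pack_payload(const void *plain, size_t len, void *out,
			   int encrypted)
{
	/* my_compress()/my_encrypt_in_place() stand in for
	 * ubifs_compress()/ubifs_encrypt() */
	size_t n = my_compress(plain, len, out);

	if (encrypted)
		n = my_encrypt_in_place(out, n);	/* pads to cipher block size */

	return n;	/* bytes that actually reach the flash */
}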
@@ -911,9 +923,11 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
* ubifs_jnl_xrename - cross rename two directory entries.
* @c: UBIFS file-system description object
* @fst_dir: parent inode of 1st directory entry to exchange
- * @fst_dentry: 1st directory entry to exchange
+ * @fst_inode: 1st inode to exchange
+ * @fst_nm: name of 1st inode to exchange
* @snd_dir: parent inode of 2nd directory entry to exchange
- * @snd_dentry: 2nd directory entry to exchange
+ * @snd_inode: 2nd inode to exchange
+ * @snd_nm: name of 2nd inode to exchange
* @sync: non-zero if the write-buffer has to be synchronized
*
* This function implements the cross rename operation which may involve
@@ -922,29 +936,29 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
* returned.
*/
int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
- const struct dentry *fst_dentry,
+ const struct inode *fst_inode,
+ const struct fscrypt_name *fst_nm,
const struct inode *snd_dir,
- const struct dentry *snd_dentry, int sync)
+ const struct inode *snd_inode,
+ const struct fscrypt_name *snd_nm, int sync)
{
union ubifs_key key;
struct ubifs_dent_node *dent1, *dent2;
int err, dlen1, dlen2, lnum, offs, len, plen = UBIFS_INO_NODE_SZ;
int aligned_dlen1, aligned_dlen2;
int twoparents = (fst_dir != snd_dir);
- const struct inode *fst_inode = d_inode(fst_dentry);
- const struct inode *snd_inode = d_inode(snd_dentry);
void *p;
- dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu",
- fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino);
+ //dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu",
+ // fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino);
ubifs_assert(ubifs_inode(fst_dir)->data_len == 0);
ubifs_assert(ubifs_inode(snd_dir)->data_len == 0);
ubifs_assert(mutex_is_locked(&ubifs_inode(fst_dir)->ui_mutex));
ubifs_assert(mutex_is_locked(&ubifs_inode(snd_dir)->ui_mutex));
- dlen1 = UBIFS_DENT_NODE_SZ + snd_dentry->d_name.len + 1;
- dlen2 = UBIFS_DENT_NODE_SZ + fst_dentry->d_name.len + 1;
+ dlen1 = UBIFS_DENT_NODE_SZ + fname_len(snd_nm) + 1;
+ dlen2 = UBIFS_DENT_NODE_SZ + fname_len(fst_nm) + 1;
aligned_dlen1 = ALIGN(dlen1, 8);
aligned_dlen2 = ALIGN(dlen2, 8);
@@ -963,24 +977,24 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
/* Make new dent for 1st entry */
dent1->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, &snd_dentry->d_name);
+ dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, snd_nm);
dent1->inum = cpu_to_le64(fst_inode->i_ino);
dent1->type = get_dent_type(fst_inode->i_mode);
- dent1->nlen = cpu_to_le16(snd_dentry->d_name.len);
- memcpy(dent1->name, snd_dentry->d_name.name, snd_dentry->d_name.len);
- dent1->name[snd_dentry->d_name.len] = '\0';
+ dent1->nlen = cpu_to_le16(fname_len(snd_nm));
+ memcpy(dent1->name, fname_name(snd_nm), fname_len(snd_nm));
+ dent1->name[fname_len(snd_nm)] = '\0';
zero_dent_node_unused(dent1);
ubifs_prep_grp_node(c, dent1, dlen1, 0);
/* Make new dent for 2nd entry */
dent2 = (void *)dent1 + aligned_dlen1;
dent2->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, &fst_dentry->d_name);
+ dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, fst_nm);
dent2->inum = cpu_to_le64(snd_inode->i_ino);
dent2->type = get_dent_type(snd_inode->i_mode);
- dent2->nlen = cpu_to_le16(fst_dentry->d_name.len);
- memcpy(dent2->name, fst_dentry->d_name.name, fst_dentry->d_name.len);
- dent2->name[fst_dentry->d_name.len] = '\0';
+ dent2->nlen = cpu_to_le16(fname_len(fst_nm));
+ memcpy(dent2->name, fname_name(fst_nm), fname_len(fst_nm));
+ dent2->name[fname_len(fst_nm)] = '\0';
zero_dent_node_unused(dent2);
ubifs_prep_grp_node(c, dent2, dlen2, 0);
@@ -1004,14 +1018,14 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
}
release_head(c, BASEHD);
- dent_key_init(c, &key, snd_dir->i_ino, &snd_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &snd_dentry->d_name);
+ dent_key_init(c, &key, snd_dir->i_ino, snd_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm);
if (err)
goto out_ro;
offs += aligned_dlen1;
- dent_key_init(c, &key, fst_dir->i_ino, &fst_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &fst_dentry->d_name);
+ dent_key_init(c, &key, fst_dir->i_ino, fst_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm);
if (err)
goto out_ro;
@@ -1063,31 +1077,31 @@ out_free:
* returned.
*/
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
- const struct dentry *old_dentry,
+ const struct inode *old_inode,
+ const struct fscrypt_name *old_nm,
const struct inode *new_dir,
- const struct dentry *new_dentry,
+ const struct inode *new_inode,
+ const struct fscrypt_name *new_nm,
const struct inode *whiteout, int sync)
{
void *p;
union ubifs_key key;
struct ubifs_dent_node *dent, *dent2;
int err, dlen1, dlen2, ilen, lnum, offs, len;
- const struct inode *old_inode = d_inode(old_dentry);
- const struct inode *new_inode = d_inode(new_dentry);
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
struct ubifs_inode *uninitialized_var(new_ui);
- dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu",
- old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino);
+ //dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu",
+ // old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino);
ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));
- dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
- dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
+ dlen1 = UBIFS_DENT_NODE_SZ + fname_len(new_nm) + 1;
+ dlen2 = UBIFS_DENT_NODE_SZ + fname_len(old_nm) + 1;
if (new_inode) {
new_ui = ubifs_inode(new_inode);
ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
@@ -1113,19 +1127,19 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
/* Make new dent */
dent->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
+ dent_key_init_flash(c, &dent->key, new_dir->i_ino, new_nm);
dent->inum = cpu_to_le64(old_inode->i_ino);
dent->type = get_dent_type(old_inode->i_mode);
- dent->nlen = cpu_to_le16(new_dentry->d_name.len);
- memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
- dent->name[new_dentry->d_name.len] = '\0';
+ dent->nlen = cpu_to_le16(fname_len(new_nm));
+ memcpy(dent->name, fname_name(new_nm), fname_len(new_nm));
+ dent->name[fname_len(new_nm)] = '\0';
+ set_dent_cookie(c, dent);
zero_dent_node_unused(dent);
ubifs_prep_grp_node(c, dent, dlen1, 0);
dent2 = (void *)dent + aligned_dlen1;
dent2->ch.node_type = UBIFS_DENT_NODE;
- dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
- &old_dentry->d_name);
+ dent_key_init_flash(c, &dent2->key, old_dir->i_ino, old_nm);
if (whiteout) {
dent2->inum = cpu_to_le64(whiteout->i_ino);
@@ -1135,9 +1149,10 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
dent2->inum = 0;
dent2->type = DT_UNKNOWN;
}
- dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
- memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
- dent2->name[old_dentry->d_name.len] = '\0';
+ dent2->nlen = cpu_to_le16(fname_len(old_nm));
+ memcpy(dent2->name, fname_name(old_nm), fname_len(old_nm));
+ dent2->name[fname_len(old_nm)] = '\0';
+ set_dent_cookie(c, dent2);
zero_dent_node_unused(dent2);
ubifs_prep_grp_node(c, dent2, dlen2, 0);
@@ -1178,15 +1193,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
}
release_head(c, BASEHD);
- dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
+ dent_key_init(c, &key, new_dir->i_ino, new_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm);
if (err)
goto out_ro;
offs += aligned_dlen1;
if (whiteout) {
- dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
- err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &old_dentry->d_name);
+ dent_key_init(c, &key, old_dir->i_ino, old_nm);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm);
if (err)
goto out_ro;
@@ -1196,8 +1211,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
if (err)
goto out_ro;
- dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
- err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
+ dent_key_init(c, &key, old_dir->i_ino, old_nm);
+ err = ubifs_tnc_remove_nm(c, &key, old_nm);
if (err)
goto out_ro;
}
@@ -1251,31 +1266,55 @@ out_free:
}
/**
- * recomp_data_node - re-compress a truncated data node.
+ * truncate_data_node - re-compress/encrypt a truncated data node.
+ * @c: UBIFS file-system description object
+ * @inode: inode which refers to the data node
+ * @block: data block number
* @dn: data node to re-compress
* @new_len: new length
*
* This function is used when an inode is truncated and the last data node of
- * the inode has to be re-compressed and re-written.
+ * the inode has to be re-compressed/encrypted and re-written.
*/
-static int recomp_data_node(const struct ubifs_info *c,
- struct ubifs_data_node *dn, int *new_len)
+static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode,
+ unsigned int block, struct ubifs_data_node *dn,
+ int *new_len)
{
void *buf;
- int err, len, compr_type, out_len;
+ int err, dlen, compr_type, out_len, old_dlen;
out_len = le32_to_cpu(dn->size);
buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS);
if (!buf)
return -ENOMEM;
- len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
compr_type = le16_to_cpu(dn->compr_type);
- err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type);
- if (err)
- goto out;
- ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type);
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_decrypt(inode, dn, &dlen, block);
+ if (err)
+ goto out;
+ }
+
+ if (compr_type != UBIFS_COMPR_NONE) {
+ err = ubifs_decompress(c, &dn->data, dlen, buf, &out_len, compr_type);
+ if (err)
+ goto out;
+
+ ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type);
+ }
+
+ if (ubifs_crypt_is_encrypted(inode)) {
+ err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block);
+ if (err)
+ goto out;
+
+ out_len = old_dlen;
+ } else {
+ dn->compr_size = 0;
+ }
+
ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
dn->compr_type = cpu_to_le16(compr_type);
dn->size = cpu_to_le32(*new_len);
@@ -1347,17 +1386,9 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
if (le32_to_cpu(dn->size) <= dlen)
dlen = 0; /* Nothing to do */
else {
- int compr_type = le16_to_cpu(dn->compr_type);
-
- if (compr_type != UBIFS_COMPR_NONE) {
- err = recomp_data_node(c, dn, &dlen);
- if (err)
- goto out_free;
- } else {
- dn->size = cpu_to_le32(dlen);
- dlen += UBIFS_DATA_NODE_SZ;
- }
- zero_data_node_unused(dn);
+ err = truncate_data_node(c, inode, blk, dn, &dlen);
+ if (err)
+ goto out_free;
}
}
}
@@ -1442,7 +1473,8 @@ out_free:
* error code in case of failure.
*/
int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
- const struct inode *inode, const struct qstr *nm)
+ const struct inode *inode,
+ const struct fscrypt_name *nm)
{
int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
struct ubifs_dent_node *xent;
@@ -1451,9 +1483,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
int sync = IS_DIRSYNC(host);
struct ubifs_inode *host_ui = ubifs_inode(host);
- dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
- host->i_ino, inode->i_ino, nm->name,
- ubifs_inode(inode)->data_len);
+ //dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
+ // host->i_ino, inode->i_ino, nm->name,
+ // ubifs_inode(inode)->data_len);
ubifs_assert(inode->i_nlink == 0);
ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
@@ -1461,7 +1493,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
* Since we are deleting the inode, we do not bother to attach any data
* to it and assume its length is %UBIFS_INO_NODE_SZ.
*/
- xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+ xlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1;
aligned_xlen = ALIGN(xlen, 8);
hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
@@ -1482,9 +1514,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
key_write(c, &xent_key, xent->key);
xent->inum = 0;
xent->type = get_dent_type(inode->i_mode);
- xent->nlen = cpu_to_le16(nm->len);
- memcpy(xent->name, nm->name, nm->len);
- xent->name[nm->len] = '\0';
+ xent->nlen = cpu_to_le16(fname_len(nm));
+ memcpy(xent->name, fname_name(nm), fname_len(nm));
+ xent->name[fname_len(nm)] = '\0';
zero_dent_node_unused(xent);
ubifs_prep_grp_node(c, xent, xlen, 0);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index c0a95e393347..7547be512db2 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -69,7 +69,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
uint32_t a = 0;
const signed char *str = (const signed char *)s;
- while (*str) {
+ while (len--) {
a += *str << 4;
a += *str >> 4;
a *= 11;
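The loop change switches the hash from NUL-terminated to length-delimited
input, which encrypted names require since they may contain zero bytes.
Pulled out as a self-contained function for clarity; the in-tree version
additionally masks the result into the key hash space:

#include <stdint.h>

static uint32_t r5_hash(const char *s, int len)
{
	uint32_t a = 0;
	const signed char *str = (const signed char *)s;

	while (len--) {		/* was: while (*str) */
		a += *str << 4;
		a += *str >> 4;
		a *= 11;
		str++;
	}
	return a;
}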
@@ -153,13 +153,13 @@ static inline void highest_ino_key(const struct ubifs_info *c,
* @c: UBIFS file-system description object
* @key: key to initialize
* @inum: parent inode number
- * @nm: direntry name and length
+ * @nm: direntry name and length. Not a string when encrypted!
*/
static inline void dent_key_init(const struct ubifs_info *c,
union ubifs_key *key, ino_t inum,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->u32[0] = inum;
@@ -191,10 +191,11 @@ static inline void dent_key_init_hash(const struct ubifs_info *c,
* @nm: direntry name and length
*/
static inline void dent_key_init_flash(const struct ubifs_info *c, void *k,
- ino_t inum, const struct qstr *nm)
+ ino_t inum,
+ const struct fscrypt_name *nm)
{
union ubifs_key *key = k;
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->j32[0] = cpu_to_le32(inum);
@@ -225,9 +226,9 @@ static inline void lowest_dent_key(const struct ubifs_info *c,
*/
static inline void xent_key_init(const struct ubifs_info *c,
union ubifs_key *key, ino_t inum,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->u32[0] = inum;
@@ -242,10 +243,10 @@ static inline void xent_key_init(const struct ubifs_info *c,
* @nm: extended attribute entry name and length
*/
static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
- ino_t inum, const struct qstr *nm)
+ ino_t inum, const struct fscrypt_name *nm)
{
union ubifs_key *key = k;
- uint32_t hash = c->key_hash(nm->name, nm->len);
+ uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm));
ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
key->j32[0] = cpu_to_le32(inum);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index fb0f44cd1e28..ae5c02f22f3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -61,7 +61,7 @@ struct replay_entry {
struct list_head list;
union ubifs_key key;
union {
- struct qstr nm;
+ struct fscrypt_name nm;
struct {
loff_t old_size;
loff_t new_size;
@@ -327,7 +327,7 @@ static void destroy_replay_list(struct ubifs_info *c)
list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
if (is_hash_key(c, &r->key))
- kfree(r->nm.name);
+ kfree(fname_name(&r->nm));
list_del(&r->list);
kfree(r);
}
@@ -430,10 +430,10 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
r->deletion = !!deletion;
r->sqnum = sqnum;
key_copy(c, key, &r->key);
- r->nm.len = nlen;
+ fname_len(&r->nm) = nlen;
memcpy(nbuf, name, nlen);
nbuf[nlen] = '\0';
- r->nm.name = nbuf;
+ fname_name(&r->nm) = nbuf;
list_add_tail(&r->list, &c->replay_list);
return 0;
@@ -456,7 +456,7 @@ int ubifs_validate_entry(struct ubifs_info *c,
if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
dent->type >= UBIFS_ITYPES_CNT ||
nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
- strnlen(dent->name, nlen) != nlen ||
+ (key_type == UBIFS_XENT_KEY && strnlen(dent->name, nlen) != nlen) ||
le64_to_cpu(dent->inum) > MAX_INUM) {
ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ?
"directory entry" : "extended attribute entry");
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 3cbb904a6d7d..7f1ead29e727 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -163,6 +163,7 @@ static int create_default_filesystem(struct ubifs_info *c)
tmp64 = (long long)max_buds * c->leb_size;
if (big_lpt)
sup_flags |= UBIFS_FLG_BIGLPT;
+ sup_flags |= UBIFS_FLG_DOUBLE_HASH;
sup->ch.node_type = UBIFS_SB_NODE;
sup->key_hash = UBIFS_KEY_HASH_R5;
@@ -465,6 +466,16 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
goto failed;
}
+ if (!c->double_hash && c->fmt_version >= 5) {
+ err = 16;
+ goto failed;
+ }
+
+ if (c->encrypted && c->fmt_version < 5) {
+ err = 17;
+ goto failed;
+ }
+
return 0;
failed:
@@ -620,6 +631,24 @@ int ubifs_read_superblock(struct ubifs_info *c)
memcpy(&c->uuid, &sup->uuid, 16);
c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
+ c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH);
+ c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION);
+
+ if ((sup_flags & ~UBIFS_FLG_MASK) != 0) {
+ ubifs_err(c, "Unknown feature flags found: %#x",
+ sup_flags & ~UBIFS_FLG_MASK);
+ err = -EINVAL;
+ goto out;
+ }
+
+#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+ if (c->encrypted) {
+ ubifs_err(c, "file system contains encrypted files but UBIFS"
+ " was built without crypto support.");
+ err = -EINVAL;
+ goto out;
+ }
+#endif
/* Automatically increase file system size to the maximum size */
c->old_leb_cnt = c->leb_cnt;
@@ -807,3 +836,33 @@ int ubifs_fixup_free_space(struct ubifs_info *c)
ubifs_msg(c, "free space fixup complete");
return err;
}
+
+int ubifs_enable_encryption(struct ubifs_info *c)
+{
+ int err;
+ struct ubifs_sb_node *sup;
+
+ if (c->encrypted)
+ return 0;
+
+ if (c->ro_mount || c->ro_media)
+ return -EROFS;
+
+ if (c->fmt_version < 5) {
+ ubifs_err(c, "on-flash format version 5 is needed for encryption");
+ return -EINVAL;
+ }
+
+ sup = ubifs_read_sb_node(c);
+ if (IS_ERR(sup))
+ return PTR_ERR(sup);
+
+ sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION);
+
+ err = ubifs_write_sb_node(c, sup);
+ if (!err)
+ c->encrypted = 1;
+ kfree(sup);
+
+ return err;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 4ec051089186..e08aa04fc835 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -198,7 +198,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
}
memcpy(ui->data, ino->data, ui->data_len);
((char *)ui->data)[ui->data_len] = '\0';
- inode->i_link = ui->data;
break;
case S_IFBLK:
case S_IFCHR:
@@ -380,6 +379,9 @@ out:
}
done:
clear_inode(inode);
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+ fscrypt_put_encryption_info(inode, NULL);
+#endif
}
static void ubifs_dirty_inode(struct inode *inode, int flags)
@@ -1207,7 +1209,8 @@ static int mount_ubifs(struct ubifs_info *c)
bu_init(c);
if (!c->ro_mount) {
- c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
+ c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ +
+ UBIFS_CIPHER_BLOCK_SIZE,
GFP_KERNEL);
if (!c->write_reserve_buf)
goto out_free;
@@ -1620,7 +1623,8 @@ static int ubifs_remount_rw(struct ubifs_info *c)
goto out;
}
- c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
+ c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ +
+ UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL);
if (!c->write_reserve_buf) {
err = -ENOMEM;
goto out;
@@ -1995,6 +1999,12 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
return c;
}
+#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+struct fscrypt_operations ubifs_crypt_operations = {
+ .is_encrypted = __ubifs_crypt_is_encrypted,
+};
+#endif
+
static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ubifs_info *c = sb->s_fs_info;
@@ -2041,6 +2051,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
sb->s_xattr = ubifs_xattr_handlers;
+ sb->s_cop = &ubifs_crypt_operations;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa9a20cc60d6..74ae2de949df 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -378,7 +378,7 @@ static void lnc_free(struct ubifs_zbranch *zbr)
}
/**
- * tnc_read_node_nm - read a "hashed" leaf node.
+ * tnc_read_hashed_node - read a "hashed" leaf node.
* @c: UBIFS file-system description object
* @zbr: key and position of the node
* @node: node is returned here
@@ -388,8 +388,8 @@ static void lnc_free(struct ubifs_zbranch *zbr)
* added to LNC. Returns zero in case of success or a negative error
* code in case of failure.
*/
-static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
- void *node)
+static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node)
{
int err;
@@ -519,7 +519,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
* of failure, a negative error code is returned.
*/
static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
struct ubifs_dent_node *dent;
int nlen, err;
@@ -542,11 +542,11 @@ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
dent = zbr->leaf;
nlen = le16_to_cpu(dent->nlen);
- err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+ err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm)));
if (err == 0) {
- if (nlen == nm->len)
+ if (nlen == fname_len(nm))
return NAME_MATCHES;
- else if (nlen < nm->len)
+ else if (nlen < fname_len(nm))
return NAME_LESS;
else
return NAME_GREATER;
@@ -689,7 +689,7 @@ static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
*/
static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode **zn, int *n,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
int err;
@@ -807,7 +807,7 @@ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
*/
static int fallible_matches_name(struct ubifs_info *c,
struct ubifs_zbranch *zbr,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
struct ubifs_dent_node *dent;
int nlen, err;
@@ -835,11 +835,11 @@ static int fallible_matches_name(struct ubifs_info *c,
dent = zbr->leaf;
nlen = le16_to_cpu(dent->nlen);
- err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+ err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm)));
if (err == 0) {
- if (nlen == nm->len)
+ if (nlen == fname_len(nm))
return NAME_MATCHES;
- else if (nlen < nm->len)
+ else if (nlen < fname_len(nm))
return NAME_LESS;
else
return NAME_GREATER;
@@ -878,7 +878,8 @@ out_free:
static int fallible_resolve_collision(struct ubifs_info *c,
const union ubifs_key *key,
struct ubifs_znode **zn, int *n,
- const struct qstr *nm, int adding)
+ const struct fscrypt_name *nm,
+ int adding)
{
struct ubifs_znode *o_znode = NULL, *znode = *zn;
int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
@@ -1453,7 +1454,7 @@ again:
* In this case the leaf node cache gets used, so we pass the
* address of the zbranch and keep the mutex locked
*/
- err = tnc_read_node_nm(c, zt, node);
+ err = tnc_read_hashed_node(c, zt, node);
goto out;
}
if (safely) {
@@ -1782,19 +1783,19 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
* @node: the node is returned here
* @nm: node name
*
- * This function look up and reads a node which contains name hash in the key.
+ * This function looks up and reads a node whose key contains a name hash.
* Since the hash may have collisions, there may be many nodes with the same
* key, so we have to sequentially look to all of them until the needed one is
* found. This function returns zero in case of success, %-ENOENT if the node
* was not found, and a negative error code in case of failure.
*/
static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
- void *node, const struct qstr *nm)
+ void *node, const struct fscrypt_name *nm)
{
int found, n, err;
struct ubifs_znode *znode;
- dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
+ //dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
mutex_lock(&c->tnc_mutex);
found = ubifs_lookup_level0(c, key, &znode, &n);
if (!found) {
@@ -1816,7 +1817,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
goto out_unlock;
}
- err = tnc_read_node_nm(c, &znode->zbranch[n], node);
+ err = tnc_read_hashed_node(c, &znode->zbranch[n], node);
out_unlock:
mutex_unlock(&c->tnc_mutex);
@@ -1830,14 +1831,14 @@ out_unlock:
* @node: the node is returned here
* @nm: node name
*
- * This function look up and reads a node which contains name hash in the key.
+ * This function looks up and reads a node which contains name hash in the key.
* Since the hash may have collisions, there may be many nodes with the same
 * key, so we have to sequentially look through all of them until the needed one is
* found. This function returns zero in case of success, %-ENOENT if the node
* was not found, and a negative error code in case of failure.
*/
int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
- void *node, const struct qstr *nm)
+ void *node, const struct fscrypt_name *nm)
{
int err, len;
const struct ubifs_dent_node *dent = node;
@@ -1851,16 +1852,105 @@ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
return err;
len = le16_to_cpu(dent->nlen);
- if (nm->len == len && !memcmp(dent->name, nm->name, len))
+ if (fname_len(nm) == len && !memcmp(dent->name, fname_name(nm), len))
return 0;
/*
	 * Unluckily, there are hash collisions and we have to iterate over
	 * them, looking at each direntry with colliding name hash sequentially.
*/
+
return do_lookup_nm(c, key, node, nm);
}
+static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_dent_node *dent, uint32_t cookie)
+{
+ int n, err, type = key_type(c, key);
+ struct ubifs_znode *znode;
+ struct ubifs_zbranch *zbr;
+ union ubifs_key *dkey, start_key;
+
+ ubifs_assert(is_hash_key(c, key));
+
+ lowest_dent_key(c, &start_key, key_inum(c, key));
+
+ mutex_lock(&c->tnc_mutex);
+ err = ubifs_lookup_level0(c, &start_key, &znode, &n);
+ if (unlikely(err < 0))
+ goto out_unlock;
+
+ for (;;) {
+ if (!err) {
+ err = tnc_next(c, &znode, &n);
+ if (err)
+ goto out_unlock;
+ }
+
+ zbr = &znode->zbranch[n];
+ dkey = &zbr->key;
+
+ if (key_inum(c, dkey) != key_inum(c, key) ||
+ key_type(c, dkey) != type) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ err = tnc_read_hashed_node(c, zbr, dent);
+ if (err)
+ goto out_unlock;
+
+ if (key_hash(c, key) == key_hash(c, dkey) &&
+ le32_to_cpu(dent->cookie) == cookie)
+ goto out_unlock;
+ }
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_lookup_dh - look up a "double hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to look up
+ * @node: the node is returned here
+ * @cookie: node cookie for collision resolution
+ *
+ * This function looks up and reads a node which contains name hash in the key.
+ * Since the hash may have collisions, there may be many nodes with the same
+ * key, so we have to sequentially look through all of them until the needed one
+ * with the same cookie value is found.
+ * This function returns zero in case of success, %-ENOENT if the node
+ * was not found, and a negative error code in case of failure.
+ */
+int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, uint32_t cookie)
+{
+ int err;
+ const struct ubifs_dent_node *dent = node;
+
+ if (!c->double_hash)
+ return -EOPNOTSUPP;
+
+ /*
+	 * We assume that in most cases there are no name collisions and
+ * 'ubifs_tnc_lookup()' returns us the right direntry.
+ */
+ err = ubifs_tnc_lookup(c, key, node);
+ if (err)
+ return err;
+
+ if (le32_to_cpu(dent->cookie) == cookie)
+ return 0;
+
+ /*
+	 * Unluckily, there are hash collisions and we have to iterate over
+	 * them, looking at each direntry with colliding name hash sequentially.
+ */
+ return do_lookup_dh(c, key, node, cookie);
+}
+
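
The point of the double-hash scheme is that a 64-bit directory position handed to user space can survive name-hash collisions: one half carries the 32-bit key hash, the other the per-entry random cookie that do_lookup_dh() matches against. A hedged sketch of that packing (helper names are illustrative, not the kernel API):

	#include <stdint.h>

	static inline uint64_t pack_dir_pos(uint32_t name_hash, uint32_t cookie)
	{
		/* Hash selects the key; cookie disambiguates collisions. */
		return ((uint64_t)name_hash << 32) | cookie;
	}

	static inline void unpack_dir_pos(uint64_t pos, uint32_t *name_hash,
					  uint32_t *cookie)
	{
		*name_hash = pos >> 32;
		*cookie = (uint32_t)pos;
	}
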
/**
* correct_parent_keys - correct parent znodes' keys.
* @c: UBIFS file-system description object
@@ -2279,14 +2369,15 @@ out_unlock:
* may have collisions, like directory entry keys.
*/
int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
- int lnum, int offs, int len, const struct qstr *nm)
+ int lnum, int offs, int len,
+ const struct fscrypt_name *nm)
{
int found, n, err = 0;
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
- lnum, offs, nm->len, nm->name);
+ //dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+ // lnum, offs, nm->len, nm->name);
found = lookup_level0_dirty(c, key, &znode, &n);
if (found < 0) {
err = found;
@@ -2344,7 +2435,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
* by passing 'ubifs_tnc_remove_nm()' the same key but
* an unmatchable name.
*/
- struct qstr noname = { .name = "" };
+ struct fscrypt_name noname = { .disk_name = { .name = "", .len = 1 } };
err = dbg_check_tnc(c, 0);
mutex_unlock(&c->tnc_mutex);
@@ -2514,13 +2605,13 @@ out_unlock:
* Returns %0 on success or negative error code on failure.
*/
int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
int n, err;
struct ubifs_znode *znode;
mutex_lock(&c->tnc_mutex);
- dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
+ //dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
err = lookup_level0_dirty(c, key, &znode, &n);
if (err < 0)
goto out_unlock;
@@ -2669,7 +2760,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
{
union ubifs_key key1, key2;
struct ubifs_dent_node *xent, *pxent = NULL;
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
dbg_tnc("ino %lu", (unsigned long)inum);
@@ -2694,8 +2785,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
dbg_tnc("xent '%s', ino %lu", xent->name,
(unsigned long)xattr_inum);
- nm.name = xent->name;
- nm.len = le16_to_cpu(xent->nlen);
+ fname_name(&nm) = xent->name;
+ fname_len(&nm) = le16_to_cpu(xent->nlen);
err = ubifs_tnc_remove_nm(c, &key1, &nm);
if (err) {
kfree(xent);
@@ -2747,7 +2838,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
*/
struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
union ubifs_key *key,
- const struct qstr *nm)
+ const struct fscrypt_name *nm)
{
int n, err, type = key_type(c, key);
struct ubifs_znode *znode;
@@ -2755,7 +2846,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
struct ubifs_zbranch *zbr;
union ubifs_key *dkey;
- dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
+ //dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
ubifs_assert(is_hash_key(c, key));
mutex_lock(&c->tnc_mutex);
@@ -2763,7 +2854,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
if (unlikely(err < 0))
goto out_unlock;
- if (nm->name) {
+ if (fname_len(nm) > 0) {
if (err) {
/* Handle collisions */
err = resolve_collision(c, key, &znode, &n, nm);
@@ -2813,7 +2904,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
goto out_free;
}
- err = tnc_read_node_nm(c, zbr, dent);
+ err = tnc_read_hashed_node(c, zbr, dent);
if (unlikely(err))
goto out_free;
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index e24380cf46ed..e8c23c9d4f4a 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -46,7 +46,7 @@
* UBIFS went into mainline kernel with format version 4. The older formats
* were development formats.
*/
-#define UBIFS_FORMAT_VERSION 4
+#define UBIFS_FORMAT_VERSION 5
/*
* Read-only compatibility version. If the UBIFS format is changed, older UBIFS
@@ -301,6 +301,13 @@ enum {
#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ
/*
+ * xattr name of the UBIFS encryption context; we use neither a prefix
+ * nor a long name, in order not to waste space on the flash.
+ */
+#define UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT "c"
+
+
+/*
* On-flash inode flags.
*
* UBIFS_COMPR_FL: use compression for this inode
@@ -309,6 +316,7 @@ enum {
* UBIFS_APPEND_FL: writes to the inode may only append data
* UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous
* UBIFS_XATTR_FL: this inode is the inode for an extended attribute value
+ * UBIFS_CRYPT_FL: use encryption for this inode
*
* Note, these are on-flash flags which correspond to ioctl flags
* (@FS_COMPR_FL, etc). They have the same values now, but generally, do not
@@ -321,6 +329,7 @@ enum {
UBIFS_APPEND_FL = 0x08,
UBIFS_DIRSYNC_FL = 0x10,
UBIFS_XATTR_FL = 0x20,
+ UBIFS_CRYPT_FL = 0x40,
};
/* Inode flag bits used by UBIFS */
@@ -409,12 +418,19 @@ enum {
*
* UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
* UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
+ * UBIFS_FLG_DOUBLE_HASH: store a 32-bit cookie in directory entry nodes to
+ * support 64-bit cookies for lookups by hash
+ * UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files
*/
enum {
UBIFS_FLG_BIGLPT = 0x02,
UBIFS_FLG_SPACE_FIXUP = 0x04,
+ UBIFS_FLG_DOUBLE_HASH = 0x08,
+ UBIFS_FLG_ENCRYPTION = 0x10,
};
+#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION)
+
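
UBIFS_FLG_MASK collects every feature flag this implementation understands, which reduces "was this image written by a newer format?" to a single test. A minimal sketch of such a mount-time check (the helper name is made up; the real validation lives in the superblock code):

	static int check_sup_flags(uint32_t flags)
	{
		if (flags & ~UBIFS_FLG_MASK)
			return -EINVAL;	/* unknown feature bit set */
		return 0;
	}
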
/**
* struct ubifs_ch - common header node.
* @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
@@ -521,7 +537,8 @@ struct ubifs_ino_node {
* @padding1: reserved for future, zeroes
* @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc)
* @nlen: name length
- * @padding2: reserved for future, zeroes
+ * @cookie: a 32-bit random number, used to construct a 64-bit
+ * identifier.
* @name: zero-terminated name
*
* Note, do not forget to amend 'zero_dent_node_unused()' function when
@@ -534,7 +551,7 @@ struct ubifs_dent_node {
__u8 padding1;
__u8 type;
__le16 nlen;
- __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
+ __le32 cookie;
__u8 name[];
} __packed;
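
The new @cookie field is meant to be filled with fresh randomness when the entry is written; combined with the 32-bit name hash it forms the 64-bit identifier described above. A sketch of the assignment, assuming prandom_u32() from the <linux/random.h> include added further below (illustrative, not the exact journal code):

	static void init_dent_cookie(struct ubifs_info *c,
				     struct ubifs_dent_node *dent)
	{
		/* Only meaningful when the double-hash feature is enabled. */
		if (c->double_hash)
			dent->cookie = cpu_to_le32(prandom_u32());
		else
			dent->cookie = 0;
	}
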
@@ -544,18 +561,16 @@ struct ubifs_dent_node {
* @key: node key
* @size: uncompressed data size in bytes
* @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc)
- * @padding: reserved for future, zeroes
+ * @compr_size: compressed data size in bytes, only valid when data is encrypted
* @data: data
*
- * Note, do not forget to amend 'zero_data_node_unused()' function when
- * changing the padding fields.
*/
struct ubifs_data_node {
struct ubifs_ch ch;
__u8 key[UBIFS_MAX_KEY_LEN];
__le32 size;
__le16 compr_type;
- __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
+ __le16 compr_size;
__u8 data[];
} __packed;
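
@compr_size exists because encryption pads the compressed payload out to the cipher block size, so the length after compression can no longer be inferred from the node size. A hedged sketch of the write-side arithmetic (names are illustrative, assuming the kernel's round_up()):

	static unsigned int pad_for_cipher(struct ubifs_data_node *dn,
					   unsigned int compr_len)
	{
		unsigned int pad_len = round_up(compr_len,
						UBIFS_CIPHER_BLOCK_SIZE);

		/* Keep the real compressed length for decrypt-then-decompress. */
		dn->compr_size = cpu_to_le16(compr_len);
		return pad_len;	/* number of bytes actually encrypted */
	}
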
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 096035eb29d0..ca72382ce6cc 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -38,6 +38,8 @@
#include <linux/backing-dev.h>
#include <linux/security.h>
#include <linux/xattr.h>
+#include <linux/fscrypto.h>
+#include <linux/random.h>
#include "ubifs-media.h"
/* Version of this UBIFS implementation */
@@ -83,10 +85,6 @@
*/
#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
-/* Write-buffer synchronization timeout interval in seconds */
-#define WBUF_TIMEOUT_SOFTLIMIT 3
-#define WBUF_TIMEOUT_HARDLIMIT 5
-
/* Maximum possible inode number (only 32-bit inodes are supported now) */
#define MAX_INUM 0xFFFFFFFF
@@ -138,6 +136,12 @@
*/
#define WORST_COMPR_FACTOR 2
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE
+#else
+#define UBIFS_CIPHER_BLOCK_SIZE 0
+#endif
+
/*
* How much memory is needed for a buffer where we compress a data node.
*/
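
Given WORST_COMPR_FACTOR and UBIFS_CIPHER_BLOCK_SIZE, that scratch buffer has to hold a node header, data that "compression" may have grown, and room for cipher padding. A sketch of such a sizing macro (name and exact formula are illustrative; the tree defines its own constant at this spot):

	#define DATA_NODE_SCRATCH_SZ \
		(UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR + \
		 UBIFS_CIPHER_BLOCK_SIZE)
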
@@ -645,9 +649,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
* @io_mutex: serializes write-buffer I/O
* @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
* fields
- * @softlimit: soft write-buffer timeout interval
- * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit
- * and @softlimit + @delta)
* @timer: write-buffer timer
* @no_timer: non-zero if this write-buffer does not have a timer
* @need_sync: non-zero if the timer expired and the wbuf needs sync'ing
@@ -676,8 +677,6 @@ struct ubifs_wbuf {
int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
struct mutex io_mutex;
spinlock_t lock;
- ktime_t softlimit;
- unsigned long long delta;
struct hrtimer timer;
unsigned int no_timer:1;
unsigned int need_sync:1;
@@ -1007,6 +1006,8 @@ struct ubifs_debug_info;
*
* @big_lpt: flag that LPT is too big to write whole during commit
* @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
+ * @double_hash: flag indicating that we can do lookups by hash
+ * @encrypted: flag indicating that this file system contains encrypted files
* @no_chk_data_crc: do not check CRCs when reading data nodes (except during
* recovery)
* @bulk_read: enable bulk-reads
@@ -1249,6 +1250,8 @@ struct ubifs_info {
unsigned int big_lpt:1;
unsigned int space_fixup:1;
+ unsigned int double_hash:1;
+ unsigned int encrypted:1;
unsigned int no_chk_data_crc:1;
unsigned int bulk_read:1;
unsigned int default_compr:2;
@@ -1515,25 +1518,29 @@ int ubifs_consolidate_log(struct ubifs_info *c);
/* journal.c */
int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
- const struct qstr *nm, const struct inode *inode,
+ const struct fscrypt_name *nm, const struct inode *inode,
int deletion, int xent);
int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
const union ubifs_key *key, const void *buf, int len);
int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
- const struct dentry *fst_dentry,
+ const struct inode *fst_inode,
+ const struct fscrypt_name *fst_nm,
const struct inode *snd_dir,
- const struct dentry *snd_dentry, int sync);
+ const struct inode *snd_inode,
+ const struct fscrypt_name *snd_nm, int sync);
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
- const struct dentry *old_dentry,
+ const struct inode *old_inode,
+ const struct fscrypt_name *old_nm,
const struct inode *new_dir,
- const struct dentry *new_dentry,
+ const struct inode *new_inode,
+ const struct fscrypt_name *new_nm,
const struct inode *whiteout, int sync);
int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
loff_t old_size, loff_t new_size);
int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
- const struct inode *inode, const struct qstr *nm);
+ const struct inode *inode, const struct fscrypt_name *nm);
int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1,
const struct inode *inode2);
@@ -1568,7 +1575,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
struct ubifs_znode **zn, int *n);
int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
- void *node, const struct qstr *nm);
+ void *node, const struct fscrypt_name *nm);
+int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key,
+			void *node, uint32_t cookie);
int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
void *node, int *lnum, int *offs);
int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
@@ -1576,16 +1585,16 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
int old_lnum, int old_offs, int lnum, int offs, int len);
int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
- int lnum, int offs, int len, const struct qstr *nm);
+ int lnum, int offs, int len, const struct fscrypt_name *nm);
int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
- const struct qstr *nm);
+ const struct fscrypt_name *nm);
int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
union ubifs_key *to_key);
int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum);
struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
union ubifs_key *key,
- const struct qstr *nm);
+ const struct fscrypt_name *nm);
void ubifs_tnc_close(struct ubifs_info *c);
int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
int lnum, int offs, int is_idx);
@@ -1642,6 +1651,7 @@ int ubifs_read_superblock(struct ubifs_info *c);
struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
int ubifs_fixup_free_space(struct ubifs_info *c);
+int ubifs_enable_encryption(struct ubifs_info *c);
/* replay.c */
int ubifs_validate_entry(struct ubifs_info *c,
@@ -1733,16 +1743,21 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
#endif
/* dir.c */
-struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
umode_t mode);
int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
+int ubifs_check_dir_empty(struct inode *dir);
/* xattr.c */
extern const struct xattr_handler *ubifs_xattr_handlers[];
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
int ubifs_init_security(struct inode *dentry, struct inode *inode,
const struct qstr *qstr);
+int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
+ size_t size, int flags);
+ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
+ size_t size);
/* super.c */
struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
@@ -1781,6 +1796,66 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
#include "misc.h"
#include "key.h"
+#ifndef CONFIG_UBIFS_FS_ENCRYPTION
+#define fscrypt_set_d_op(i)
+#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
+#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
+#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
+#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
+#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
+#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
+#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
+#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
+#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy
+#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
+#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
+#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
+#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
+#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
+#define fscrypt_free_filename fscrypt_notsupp_free_filename
+#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
+#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
+#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
+#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
+#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
+static inline int ubifs_encrypt(const struct inode *inode,
+ struct ubifs_data_node *dn,
+ unsigned int in_len, unsigned int *out_len,
+ int block)
+{
+ ubifs_assert(0);
+ return -EOPNOTSUPP;
+}
+static inline int ubifs_decrypt(const struct inode *inode,
+ struct ubifs_data_node *dn,
+ unsigned int *out_len, int block)
+{
+ ubifs_assert(0);
+ return -EOPNOTSUPP;
+}
+#else
+/* crypto.c */
+int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int in_len, unsigned int *out_len, int block);
+int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
+ unsigned int *out_len, int block);
+#endif
+
+extern struct fscrypt_operations ubifs_crypt_operations;
+
+static inline bool __ubifs_crypt_is_encrypted(struct inode *inode)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ return ui->flags & UBIFS_CRYPT_FL;
+}
+
+static inline bool ubifs_crypt_is_encrypted(const struct inode *inode)
+{
+ return __ubifs_crypt_is_encrypted((struct inode *)inode);
+}
+
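
The #define table above remaps the fscrypt entry points to _notsupp stubs when CONFIG_UBIFS_FS_ENCRYPTION is off, so call sites need no #ifdefs of their own. A hedged sketch of a caller that compiles either way (the helper name is invented; error handling trimmed):

	static int prepare_encrypted_inode(struct inode *inode)
	{
		if (!ubifs_crypt_is_encrypted(inode))
			return 0;
		/* Without encryption support this resolves to a stub that
		 * returns -EOPNOTSUPP. */
		return fscrypt_get_encryption_info(inode);
	}
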
/* Normal UBIFS messages */
__printf(2, 3)
void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index d9f9615bfd71..efe00fcb8b75 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -97,7 +97,7 @@ static const struct file_operations empty_fops;
* of failure.
*/
static int create_xattr(struct ubifs_info *c, struct inode *host,
- const struct qstr *nm, const void *value, int size)
+ const struct fscrypt_name *nm, const void *value, int size)
{
int err, names_len;
struct inode *inode;
@@ -117,7 +117,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
* extended attributes if the name list becomes larger. This limitation
* is artificial for UBIFS, though.
*/
- names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
+ names_len = host_ui->xattr_names + host_ui->xattr_cnt + fname_len(nm) + 1;
if (names_len > XATTR_LIST_MAX) {
ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
host->i_ino, names_len, XATTR_LIST_MAX);
@@ -154,9 +154,18 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
host_ui->xattr_cnt += 1;
- host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(size);
- host_ui->xattr_names += nm->len;
+ host_ui->xattr_names += fname_len(nm);
+
+ /*
+	 * We handle UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT here because we
+	 * have to set the UBIFS_CRYPT_FL flag on the host inode, and doing
+	 * so here avoids updating the same inode twice in one operation.
+ */
+ if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0)
+ host_ui->flags |= UBIFS_CRYPT_FL;
err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
if (err)
@@ -170,9 +179,10 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
out_cancel:
host_ui->xattr_cnt -= 1;
- host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(size);
- host_ui->xattr_names -= nm->len;
+ host_ui->xattr_names -= fname_len(nm);
+ host_ui->flags &= ~UBIFS_CRYPT_FL;
mutex_unlock(&host_ui->ui_mutex);
out_free:
make_bad_inode(inode);
@@ -269,22 +279,28 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
return ERR_PTR(-EINVAL);
}
-static int __ubifs_setxattr(struct inode *host, const char *name,
- const void *value, size_t size, int flags)
+int ubifs_xattr_set(struct inode *host, const char *name, const void *value,
+ size_t size, int flags)
{
struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = QSTR_INIT(name, strlen(name));
+ struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))};
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
- ubifs_assert(inode_is_locked(host));
+ /*
+ * Creating an encryption context is done unlocked since we
+ * operate on a new inode which is not visible to other users
+ * at this point.
+ */
+ if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) != 0)
+ ubifs_assert(inode_is_locked(host));
if (size > UBIFS_MAX_INO_DATA)
return -ERANGE;
- if (nm.len > UBIFS_MAX_NLEN)
+ if (fname_len(&nm) > UBIFS_MAX_NLEN)
return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
@@ -329,18 +345,18 @@ out_free:
return err;
}
-static ssize_t __ubifs_getxattr(struct inode *host, const char *name,
- void *buf, size_t size)
+ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf,
+ size_t size)
{
struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = QSTR_INIT(name, strlen(name));
+ struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))};
struct ubifs_inode *ui;
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
- if (nm.len > UBIFS_MAX_NLEN)
+ if (fname_len(&nm) > UBIFS_MAX_NLEN)
return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
@@ -387,6 +403,20 @@ out_unlock:
return err;
}
+static bool xattr_visible(const char *name)
+{
+ /* File encryption related xattrs are for internal use only */
+ if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0)
+ return false;
+
+ /* Show trusted namespace only for "power" users */
+ if (strncmp(name, XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN) == 0 && !capable(CAP_SYS_ADMIN))
+ return false;
+
+ return true;
+}
+
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
union ubifs_key key;
@@ -395,7 +425,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_dent_node *xent, *pxent = NULL;
int err, len, written = 0;
- struct qstr nm = { .name = NULL };
+ struct fscrypt_name nm = {0};
dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino,
dentry, size);
@@ -419,15 +449,12 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
break;
}
- nm.name = xent->name;
- nm.len = le16_to_cpu(xent->nlen);
+ fname_name(&nm) = xent->name;
+ fname_len(&nm) = le16_to_cpu(xent->nlen);
- /* Show trusted namespace only for "power" users */
- if (strncmp(xent->name, XATTR_TRUSTED_PREFIX,
- XATTR_TRUSTED_PREFIX_LEN) ||
- capable(CAP_SYS_ADMIN)) {
- memcpy(buffer + written, nm.name, nm.len + 1);
- written += nm.len + 1;
+ if (xattr_visible(xent->name)) {
+ memcpy(buffer + written, fname_name(&nm), fname_len(&nm) + 1);
+ written += fname_len(&nm) + 1;
}
kfree(pxent);
@@ -446,7 +473,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
static int remove_xattr(struct ubifs_info *c, struct inode *host,
- struct inode *inode, const struct qstr *nm)
+ struct inode *inode, const struct fscrypt_name *nm)
{
int err;
struct ubifs_inode *host_ui = ubifs_inode(host);
@@ -463,9 +490,9 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
host_ui->xattr_cnt -= 1;
- host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
- host_ui->xattr_names -= nm->len;
+ host_ui->xattr_names -= fname_len(nm);
err = ubifs_jnl_delete_xattr(c, host, inode, nm);
if (err)
@@ -477,27 +504,27 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
out_cancel:
host_ui->xattr_cnt += 1;
- host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
- host_ui->xattr_names += nm->len;
+ host_ui->xattr_names += fname_len(nm);
mutex_unlock(&host_ui->ui_mutex);
ubifs_release_budget(c, &req);
make_bad_inode(inode);
return err;
}
-static int __ubifs_removexattr(struct inode *host, const char *name)
+static int ubifs_xattr_remove(struct inode *host, const char *name)
{
struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
- struct qstr nm = QSTR_INIT(name, strlen(name));
+ struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))};
struct ubifs_dent_node *xent;
union ubifs_key key;
int err;
ubifs_assert(inode_is_locked(host));
- if (nm.len > UBIFS_MAX_NLEN)
+ if (fname_len(&nm) > UBIFS_MAX_NLEN)
return -ENAMETOOLONG;
xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
@@ -548,7 +575,8 @@ static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
- err = __ubifs_setxattr(inode, name, xattr->value, xattr->value_len, 0);
+ err = ubifs_xattr_set(inode, name, xattr->value,
+ xattr->value_len, 0);
kfree(name);
if (err < 0)
break;
@@ -572,7 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
return err;
}
-static int ubifs_xattr_get(const struct xattr_handler *handler,
+static int xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
{
@@ -580,10 +608,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
inode->i_ino, dentry, size);
name = xattr_full_name(handler, name);
- return __ubifs_getxattr(inode, name, buffer, size);
+ return ubifs_xattr_get(inode, name, buffer, size);
}
-static int ubifs_xattr_set(const struct xattr_handler *handler,
+static int xattr_set(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
@@ -594,27 +622,27 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
name = xattr_full_name(handler, name);
if (value)
- return __ubifs_setxattr(inode, name, value, size, flags);
+ return ubifs_xattr_set(inode, name, value, size, flags);
else
- return __ubifs_removexattr(inode, name);
+ return ubifs_xattr_remove(inode, name);
}
static const struct xattr_handler ubifs_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
- .get = ubifs_xattr_get,
- .set = ubifs_xattr_set,
+ .get = xattr_get,
+ .set = xattr_set,
};
static const struct xattr_handler ubifs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .get = ubifs_xattr_get,
- .set = ubifs_xattr_set,
+ .get = xattr_get,
+ .set = xattr_set,
};
static const struct xattr_handler ubifs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .get = ubifs_xattr_get,
- .set = ubifs_xattr_set,
+ .get = xattr_get,
+ .set = xattr_set,
};
const struct xattr_handler *ubifs_xattr_handlers[] = {
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index aaec13c95253..2d0e028067eb 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,6 +30,7 @@
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/bio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 988d5352bdb8..7aa48bd7cbaf 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -16,6 +16,7 @@
#include <linux/fs.h>
#include <linux/string.h>
+#include <linux/bio.h>
struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
struct udf_fileident_bh *fibh,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index aad46401ede5..0f3db71753aa 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,6 +38,7 @@
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
#include <linux/uio.h>
+#include <linux/bio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 67e085d591d8..a0376a2c1c29 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -15,6 +15,7 @@
#include <linux/buffer_head.h>
#include <linux/capability.h>
#include <linux/bitops.h>
+#include <linux/bio.h>
#include <asm/byteorder.h>
#include "ufs_fs.h"
@@ -306,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
(unsigned long long)(pos + newb), pos);
bh->b_blocknr = newb + pos;
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
mark_buffer_dirty(bh);
++j;
bh = bh->b_this_page;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 190d64be22ed..45ceb94e89e4 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
if (buffer_new(bh)) {
clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
+ clean_bdev_bh_alias(bh);
/*
* we do not zeroize fragment, because of
		 * if it is mapped to a hole, it already contains zeroes
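
Both ufs hunks replace the two-argument unmap_underlying_metadata() with clean_bdev_bh_alias(), which needs only the buffer head. Assuming it is the thin wrapper introduced alongside this series, it amounts to:

	static inline void clean_bdev_bh_alias(struct buffer_head *bh)
	{
		/* Drop any stale pagecache alias of the one block @bh maps. */
		clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
	}
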
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 85959d8324df..d96e2f30084b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,9 +257,9 @@ out:
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*/
-int handle_userfault(struct fault_env *fe, unsigned long reason)
+int handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
- struct mm_struct *mm = fe->vma->vm_mm;
+ struct mm_struct *mm = vmf->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
@@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
ret = VM_FAULT_SIGBUS;
- ctx = fe->vma->vm_userfaultfd_ctx.ctx;
+ ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
@@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
- if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
+ BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
dump_stack();
}
#endif
@@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
* and wait.
*/
ret = VM_FAULT_RETRY;
- if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
/* take the reference before dropping the mmap_sem */
@@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(fe->address, fe->flags, reason);
+ uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
uwq.ctx = ctx;
return_to_userland =
- (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
TASK_KILLABLE);
spin_unlock(&ctx->fault_pending_wqh.lock);
- must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
+ must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
+ reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index effb64cf714f..5050056a0b06 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2455,12 +2455,15 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
return false;
- if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
+ if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
return false;
if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
+ (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
+ be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
return false;
/*
@@ -2477,7 +2480,8 @@ xfs_agf_verify(
return false;
if (xfs_sb_version_hasreflink(&mp->m_sb) &&
- be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
+ (be32_to_cpu(agf->agf_refcount_level) < 1 ||
+ be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
return false;
	return true;
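
The verifier now bounds each on-disk btree level count from below as well as above: zero levels is as corrupt as exceeding XFS_BTREE_MAXLEVELS. The repeated comparisons could be folded into a small helper (a sketch, not something this patch adds):

	static bool xfs_agf_level_ok(__be32 raw)
	{
		uint32_t level = be32_to_cpu(raw);

		/* A valid btree has at least the root level. */
		return level >= 1 && level <= XFS_BTREE_MAXLEVELS;
	}
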
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 5ba2dac5e67c..efb467b10a71 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -421,13 +421,17 @@ xfs_allocbt_init_cursor(
ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
cur->bc_tp = tp;
cur->bc_mp = mp;
cur->bc_btnum = btnum;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_ops = &xfs_allocbt_ops;
+ if (btnum == XFS_BTNUM_BNO)
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ else
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
if (btnum == XFS_BTNUM_CNT) {
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 8ea91f363093..2852521fc8ec 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -253,6 +253,7 @@ xfs_attr3_leaf_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_perag *pag = bp->b_pag;
struct xfs_attr3_icleaf_hdr ichdr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
@@ -273,7 +274,12 @@ xfs_attr3_leaf_verify(
if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
return false;
}
- if (ichdr.count == 0)
+ /*
+ * In recovery there is a transient state where count == 0 is valid
+ * because we may have transitioned an empty shortform attr to a leaf
+ * if the attr didn't fit in shortform.
+ */
+ if (pag && pag->pagf_init && ichdr.count == 0)
return false;
/* XXX: need to range check rest of attr header values */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 4f2aed04f827..f7dda0c237b0 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
-int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
/*
@@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+void xfs_attr3_leaf_list_int(struct xfs_buf *bp,
struct xfs_attr_list_context *context);
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c6eb21940783..2760bc3b2536 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -49,6 +49,8 @@
#include "xfs_rmap.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_icache.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -190,8 +192,12 @@ xfs_bmap_worst_indlen(
int maxrecs; /* maximum record count at this level */
xfs_mount_t *mp; /* mount structure */
xfs_filblks_t rval; /* return value */
+ xfs_filblks_t orig_len;
mp = ip->i_mount;
+
+ /* Calculate the worst-case size of the bmbt. */
+ orig_len = len;
maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
@@ -199,12 +205,20 @@ xfs_bmap_worst_indlen(
len += maxrecs - 1;
do_div(len, maxrecs);
rval += len;
- if (len == 1)
- return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+ if (len == 1) {
+ rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
level - 1;
+ break;
+ }
if (level == 0)
maxrecs = mp->m_bmap_dmxr[1];
}
+
+ /* Calculate the worst-case size of the rmapbt. */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) +
+ mp->m_rmap_maxlevels;
+
return rval;
}
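
The loop computes, level by level, how many bmbt blocks a worst-case mapping of len blocks could need: each level takes ceil(len / maxrecs) blocks, and once a level collapses to a single block every remaining level costs exactly one more. A standalone toy model of that arithmetic (parameter names are made up; the real code pulls record counts from mp->m_bmap_dmxr):

	static unsigned long worst_indlen(unsigned long len,
					  unsigned long leaf_recs,
					  unsigned long node_recs,
					  int maxlevels)
	{
		unsigned long rval = 0, maxrecs = leaf_recs;
		int level;

		for (level = 0; level < maxlevels; level++) {
			len = (len + maxrecs - 1) / maxrecs; /* ceil division */
			rval += len;
			if (len == 1)	/* one block per remaining level */
				return rval + maxlevels - level - 1;
			if (level == 0)
				maxrecs = node_recs;
		}
		return rval;
	}
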
@@ -504,7 +518,7 @@ void
xfs_bmap_trace_exlist(
xfs_inode_t *ip, /* incore inode pointer */
xfs_extnum_t cnt, /* count of entries in the list */
- int whichfork, /* data or attr fork */
+ int whichfork, /* data or attr or cow fork */
unsigned long caller_ip)
{
xfs_extnum_t idx; /* extent record index */
@@ -513,11 +527,13 @@ xfs_bmap_trace_exlist(
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
+ else if (whichfork == XFS_COW_FORK)
+ state |= BMAP_COWFORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT(cnt == xfs_iext_count(ifp));
for (idx = 0; idx < cnt; idx++)
- trace_xfs_extlist(ip, idx, whichfork, caller_ip);
+ trace_xfs_extlist(ip, idx, state, caller_ip);
}
/*
@@ -811,7 +827,7 @@ try_another_ag:
XFS_BTREE_LONG_PTRS);
arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
for (cnt = i = 0; i < nextents; i++) {
ep = xfs_iext_get_ext(ifp, i);
if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
@@ -1137,6 +1153,10 @@ xfs_bmap_add_attrfork(
goto trans_cancel;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
+ if (ip->i_d.di_anextents != 0) {
+ error = -EFSCORRUPTED;
+ goto trans_cancel;
+ }
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
/*
* For inodes coming from pre-6.2 filesystems.
@@ -1144,7 +1164,6 @@ xfs_bmap_add_attrfork(
ASSERT(ip->i_d.di_aformat == 0);
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
}
- ASSERT(ip->i_d.di_anextents == 0);
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1296,7 +1315,7 @@ xfs_bmap_read_extents(
/*
* Here with bp and block set to the leftmost leaf node in the tree.
*/
- room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ room = xfs_iext_count(ifp);
i = 0;
/*
* Loop over all leaf nodes. Copy information to the extent records.
@@ -1361,8 +1380,9 @@ xfs_bmap_read_extents(
return error;
block = XFS_BUF_TO_BLOCK(bp);
}
- ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
- ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+ if (i != XFS_IFORK_NEXTENTS(ip, whichfork))
+ return -EFSCORRUPTED;
+ ASSERT(i == xfs_iext_count(ifp));
XFS_BMAP_TRACE_EXLIST(ip, i, whichfork);
return 0;
error0:
@@ -1370,97 +1390,6 @@ error0:
return -EFSCORRUPTED;
}
-
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry. If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none). Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_multi_extents(
- xfs_ifork_t *ifp, /* inode fork pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- xfs_extnum_t lastx; /* last extent index */
-
- /*
- * Initialize the extent entry structure to catch access to
- * uninitialized br_startblock field.
- */
- gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
- gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
- gotp->br_state = XFS_EXT_INVALID;
- gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
- prevp->br_startoff = NULLFILEOFF;
-
- ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
- if (lastx > 0) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp);
- }
- if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
- xfs_bmbt_get_all(ep, gotp);
- *eofp = 0;
- } else {
- if (lastx > 0) {
- *gotp = *prevp;
- }
- *eofp = 1;
- ep = NULL;
- }
- *lastxp = lastx;
- return ep;
-}
-
-/*
- * Search the extents list for the inode, for the extent containing bno.
- * If bno lies in a hole, point to the next entry. If bno lies past eof,
- * *eofp will be set, and *prevp will contain the last entry (null if none).
- * Else, *lastxp will be set to the index of the found
- * entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t * /* pointer to found extent entry */
-xfs_bmap_search_extents(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_fileoff_t bno, /* block number searched for */
- int fork, /* data or attr fork */
- int *eofp, /* out: end of file found */
- xfs_extnum_t *lastxp, /* out: last extent index */
- xfs_bmbt_irec_t *gotp, /* out: extent entry found */
- xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
-{
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
-
- XFS_STATS_INC(ip->i_mount, xs_look_exlist);
- ifp = XFS_IFORK_PTR(ip, fork);
-
- ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
-
- if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
- !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x lastx: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)gotp->br_startblock,
- (unsigned long long)gotp->br_startoff,
- (unsigned long long)gotp->br_blockcount,
- gotp->br_state, *lastxp);
- *lastxp = NULLEXTNUM;
- *eofp = 1;
- return NULL;
- }
- return ep;
-}
-
/*
* Returns the file-relative block number of the first unused block(s)
* in the file with at least "len" logically contiguous blocks free.
@@ -1497,7 +1426,7 @@ xfs_bmap_first_unused(
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
lowest = *first_unused;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
off = xfs_bmbt_get_startoff(ep);
@@ -1523,44 +1452,44 @@ xfs_bmap_first_unused(
*/
int /* error */
xfs_bmap_last_before(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode */
- xfs_fileoff_t *last_block, /* last block */
- int whichfork) /* data or attr fork */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode */
+ xfs_fileoff_t *last_block, /* last block */
+ int whichfork) /* data or attr fork */
{
- xfs_fileoff_t bno; /* input file offset */
- int eof; /* hit end of file */
- xfs_bmbt_rec_host_t *ep; /* pointer to last extent */
- int error; /* error return value */
- xfs_bmbt_irec_t got; /* current extent value */
- xfs_ifork_t *ifp; /* inode fork pointer */
- xfs_extnum_t lastx; /* last extent used */
- xfs_bmbt_irec_t prev; /* previous extent value */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t idx;
+ int error;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return -EIO;
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ case XFS_DINODE_FMT_LOCAL:
*last_block = 0;
return 0;
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_EXTENTS:
+ break;
+ default:
+ return -EIO;
}
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!(ifp->if_flags & XFS_IFEXTENTS) &&
- (error = xfs_iread_extents(tp, ip, whichfork)))
- return error;
- bno = *last_block - 1;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
- if (eof || xfs_bmbt_get_startoff(ep) > bno) {
- if (prev.br_startoff == NULLFILEOFF)
- *last_block = 0;
- else
- *last_block = prev.br_startoff + prev.br_blockcount;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
}
- /*
- * Otherwise *last_block is already the right answer.
- */
+
+ if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) {
+ if (got.br_startoff <= *last_block - 1)
+ return 0;
+ }
+
+ if (xfs_iext_get_extent(ifp, idx - 1, &got)) {
+ *last_block = got.br_startoff + got.br_blockcount;
+ return 0;
+ }
+
+ *last_block = 0;
return 0;
}
@@ -1582,7 +1511,7 @@ xfs_bmap_last_extent(
return error;
}
- nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (nextents == 0) {
*is_empty = 1;
return 0;
@@ -1735,7 +1664,7 @@ xfs_bmap_add_extent_delay_real(
&bma->ip->i_d.di_nextents);
ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(bma->idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -1794,7 +1723,7 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (bma->idx < xfs_iext_count(ifp) - 1) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
@@ -2300,7 +2229,7 @@ xfs_bmap_add_extent_unwritten_real(
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
ASSERT(*idx >= 0);
- ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2356,7 +2285,7 @@ xfs_bmap_add_extent_unwritten_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (*idx < xfs_iext_count(&ip->i_df) - 1) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
@@ -2836,7 +2765,7 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < xfs_iext_count(ifp)) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
@@ -2966,7 +2895,7 @@ xfs_bmap_add_extent_hole_real(
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(bma->idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
@@ -2992,7 +2921,7 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (bma->idx < xfs_iext_count(ifp)) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
if (isnullstartblock(right.br_startblock))
@@ -4145,12 +4074,11 @@ xfs_bmapi_read(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec prev;
xfs_fileoff_t obno;
xfs_fileoff_t end;
- xfs_extnum_t lastx;
+ xfs_extnum_t idx;
int error;
- int eof;
+ bool eof = false;
int n = 0;
int whichfork = xfs_bmapi_whichfork(flags);
@@ -4190,7 +4118,8 @@ xfs_bmapi_read(
return error;
}
- xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev);
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
+ eof = true;
end = bno + len;
obno = bno;
@@ -4221,10 +4150,8 @@ xfs_bmapi_read(
break;
/* Else go on to the next record. */
- if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got);
- else
- eof = 1;
+ if (!xfs_iext_get_extent(ifp, ++idx, &got))
+ eof = true;
}
*nmap = n;
return 0;
@@ -4234,10 +4161,10 @@ int
xfs_bmapi_reserve_delalloc(
struct xfs_inode *ip,
int whichfork,
- xfs_fileoff_t aoff,
+ xfs_fileoff_t off,
xfs_filblks_t len,
+ xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *prev,
xfs_extnum_t *lastx,
int eof)
{
@@ -4248,10 +4175,17 @@ xfs_bmapi_reserve_delalloc(
char rt = XFS_IS_REALTIME_INODE(ip);
xfs_extlen_t extsz;
int error;
+ xfs_fileoff_t aoff = off;
- alen = XFS_FILBLKS_MIN(len, MAXEXTLEN);
+ /*
+ * Cap the alloc length. Keep track of prealloc so we know whether to
+ * tag the inode before we return.
+ */
+ alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+ if (prealloc && alen >= len)
+ prealloc = alen - len;
/* Figure out the extent size, adjust alen */
if (whichfork == XFS_COW_FORK)
@@ -4259,7 +4193,12 @@ xfs_bmapi_reserve_delalloc(
else
extsz = xfs_get_extsz_hint(ip);
if (extsz) {
- error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
+ struct xfs_bmbt_irec prev;
+
+ if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev))
+ prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
}
@@ -4312,6 +4251,16 @@ xfs_bmapi_reserve_delalloc(
*/
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+ * preallocation can occur at the start or end of the extent, even when
+ * prealloc == 0, so we must also check the aligned offset and length.
+ */
+ if (whichfork == XFS_DATA_FORK && prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
ASSERT(got->br_startoff <= aoff);
ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
ASSERT(isnullstartblock(got->br_startblock));
@@ -4349,7 +4298,7 @@ xfs_bmapi_allocate(
if (bma->wasdel) {
bma->length = (xfs_extlen_t)bma->got.br_blockcount;
bma->offset = bma->got.br_startoff;
- if (bma->idx != NULLEXTNUM && bma->idx) {
+ if (bma->idx) {
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1),
&bma->prev);
}
@@ -4563,7 +4512,7 @@ xfs_bmapi_write(
struct xfs_ifork *ifp;
struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
- int eof; /* after the end of extents */
+ bool eof = false; /* after the end of extents */
int error; /* error return */
int n; /* current extent index */
xfs_fileoff_t obno; /* old block number (offset) */
@@ -4641,12 +4590,14 @@ xfs_bmapi_write(
goto error0;
}
- xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got,
- &bma.prev);
n = 0;
end = bno + len;
obno = bno;
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got))
+ eof = true;
+ if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
bma.tp = tp;
bma.ip = ip;
bma.total = total;
@@ -4733,11 +4684,8 @@ xfs_bmapi_write(
/* Else go on to the next record. */
bma.prev = bma.got;
- if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) {
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx),
- &bma.got);
- } else
- eof = 1;
+ if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got))
+ eof = true;
}
*nmap = n;
@@ -4885,7 +4833,7 @@ xfs_bmap_del_extent_delay(
da_new = 0;
ASSERT(*idx >= 0);
- ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
@@ -4902,8 +4850,11 @@ xfs_bmap_del_extent_delay(
* sb counters as we might have to borrow some blocks for the
* indirect block accounting.
*/
- xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0,
+ error = xfs_trans_reserve_quota_nblks(NULL, ip,
+ -((long)del->br_blockcount), 0,
isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ return error;
ip->i_delayed_blks -= del->br_blockcount;
if (whichfork == XFS_COW_FORK)
@@ -5013,7 +4964,7 @@ xfs_bmap_del_extent_cow(
got_endoff = got->br_startoff + got->br_blockcount;
ASSERT(*idx >= 0);
- ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(del->br_blockcount > 0);
ASSERT(got->br_startoff <= del->br_startoff);
ASSERT(got_endoff >= del_endoff);
@@ -5119,8 +5070,7 @@ xfs_bmap_del_extent(
state |= BMAP_COWFORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)));
+ ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp)));
ASSERT(del->br_blockcount > 0);
ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &got);
@@ -5434,8 +5384,6 @@ __xfs_bunmapi(
{
xfs_btree_cur_t *cur; /* bmap btree cursor */
xfs_bmbt_irec_t del; /* extent being deleted */
- int eof; /* is deleting at eof */
- xfs_bmbt_rec_host_t *ep; /* extent record pointer */
int error; /* error return value */
xfs_extnum_t extno; /* extent number in list */
xfs_bmbt_irec_t got; /* current extent record */
@@ -5445,8 +5393,6 @@ __xfs_bunmapi(
int logflags; /* transaction logging flags */
xfs_extlen_t mod; /* rt extent offset */
xfs_mount_t *mp; /* mount structure */
- xfs_extnum_t nextents; /* number of file extents */
- xfs_bmbt_irec_t prev; /* previous extent record */
xfs_fileoff_t start; /* first file offset deleted */
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
@@ -5477,8 +5423,7 @@ __xfs_bunmapi(
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- if (nextents == 0) {
+ if (xfs_iext_count(ifp) == 0) {
*rlen = 0;
return 0;
}
@@ -5486,18 +5431,17 @@ __xfs_bunmapi(
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
start = bno;
bno = start + len - 1;
- ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
- &prev);
/*
* Check to see if the given block number is past the end of the
* file, back up to the last block if so...
*/
- if (eof) {
- ep = xfs_iext_get_ext(ifp, --lastx);
- xfs_bmbt_get_all(ep, &got);
+ if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) {
+ ASSERT(lastx > 0);
+ xfs_iext_get_extent(ifp, --lastx, &got);
bno = got.br_startoff + got.br_blockcount - 1;
}
+
logflags = 0;
if (ifp->if_flags & XFS_IFBROOT) {
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
@@ -5528,8 +5472,7 @@ __xfs_bunmapi(
if (got.br_startoff > bno) {
if (--lastx < 0)
break;
- ep = xfs_iext_get_ext(ifp, lastx);
- xfs_bmbt_get_all(ep, &got);
+ xfs_iext_get_extent(ifp, lastx, &got);
}
/*
* Is the last block of this extent before the range
@@ -5543,7 +5486,6 @@ __xfs_bunmapi(
* Then deal with the (possibly delayed) allocated space
* we found.
*/
- ASSERT(ep != NULL);
del = got;
wasdel = isnullstartblock(del.br_startblock);
if (got.br_startoff < start) {
@@ -5624,15 +5566,12 @@ __xfs_bunmapi(
*/
ASSERT(bno >= del.br_blockcount);
bno -= del.br_blockcount;
- if (got.br_startoff > bno) {
- if (--lastx >= 0) {
- ep = xfs_iext_get_ext(ifp,
- lastx);
- xfs_bmbt_get_all(ep, &got);
- }
- }
+ if (got.br_startoff > bno && --lastx >= 0)
+ xfs_iext_get_extent(ifp, lastx, &got);
continue;
} else if (del.br_state == XFS_EXT_UNWRITTEN) {
+ struct xfs_bmbt_irec prev;
+
/*
* This one is already unwritten.
* It must have a written left neighbor.
@@ -5640,8 +5579,7 @@ __xfs_bunmapi(
* try again.
*/
ASSERT(lastx > 0);
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
- lastx - 1), &prev);
+ xfs_iext_get_extent(ifp, lastx - 1, &prev);
ASSERT(prev.br_state == XFS_EXT_NORM);
ASSERT(!isnullstartblock(prev.br_startblock));
ASSERT(del.br_startblock ==
@@ -5739,13 +5677,9 @@ nodelete:
*/
if (bno != (xfs_fileoff_t)-1 && bno >= start) {
if (lastx >= 0) {
- ep = xfs_iext_get_ext(ifp, lastx);
- if (xfs_bmbt_get_startoff(ep) > bno) {
- if (--lastx >= 0)
- ep = xfs_iext_get_ext(ifp,
- lastx);
- }
- xfs_bmbt_get_all(ep, &got);
+ xfs_iext_get_extent(ifp, lastx, &got);
+ if (got.br_startoff > bno && --lastx >= 0)
+ xfs_iext_get_extent(ifp, lastx, &got);
}
extno++;
}
@@ -5963,7 +5897,7 @@ xfs_bmse_shift_one(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ total_extents = xfs_iext_count(ifp);
xfs_bmbt_get_all(gotp, &got);
@@ -6140,7 +6074,7 @@ xfs_bmap_shift_extents(
* are collapsing out, so we cannot use the count of real extents here.
* Instead we have to calculate it from the incore fork.
*/
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ total_extents = xfs_iext_count(ifp);
if (total_extents == 0) {
*done = 1;
goto del_cursor;
@@ -6200,7 +6134,7 @@ xfs_bmap_shift_extents(
* count can change. Update the total and grade the next record.
*/
if (direction == SHIFT_LEFT) {
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ total_extents = xfs_iext_count(ifp);
stop_extent = total_extents;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 7cae6ec27fa6..cecd094404cc 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -237,14 +237,9 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_defer_ops *dfops, enum shift_direction direction,
int num_exts);
int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
-struct xfs_bmbt_rec_host *
- xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
- int fork, int *eofp, xfs_extnum_t *lastxp,
- struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t aoff, xfs_filblks_t len,
- struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev,
- xfs_extnum_t *lastx, int eof);
+ xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 8007d2ba9aef..d6330c297ca0 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -796,13 +796,14 @@ xfs_bmbt_init_cursor(
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
cur->bc_tp = tp;
cur->bc_mp = mp;
cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
cur->bc_btnum = XFS_BTNUM_BMAP;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 0e80993c8a59..21e6a6ab6b9a 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block(
if (error)
return error;
+ /* Check the inode owner since the verifiers don't. */
+ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+ be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+ cur->bc_private.b.ip->i_ino)
+ goto out_bad;
+
+ /* Did we get the level we were looking for? */
+ if (be16_to_cpu((*blkp)->bb_level) != level)
+ goto out_bad;
+
+ /* Check that internal nodes have at least one record. */
+ if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
+ goto out_bad;
+
xfs_btree_setbuf(cur, level, bp);
return 0;
+
+out_bad:
+ *blkp = NULL;
+ xfs_trans_brelse(cur->bc_tp, bp);
+ return -EFSCORRUPTED;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index c2b01d1c79ee..b69b947c4c1b 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -96,46 +96,10 @@ union xfs_btree_rec {
/*
* Generic stats interface
*/
-#define __XFS_BTREE_STATS_INC(mp, type, stat) \
- XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
#define XFS_BTREE_STATS_INC(cur, stat) \
-do { \
- struct xfs_mount *__mp = cur->bc_mp; \
- switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
- case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
- case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
- } \
-} while (0)
-
-#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
- XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
-#define XFS_BTREE_STATS_ADD(cur, stat, val) \
-do { \
- struct xfs_mount *__mp = cur->bc_mp; \
- switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: \
- __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
- case XFS_BTNUM_CNT: \
- __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
- case XFS_BTNUM_BMAP: \
- __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
- case XFS_BTNUM_INO: \
- __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
- case XFS_BTNUM_FINO: \
- __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
- case XFS_BTNUM_RMAP: \
- __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
- case XFS_BTNUM_REFC: \
- __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
- } \
-} while (0)
+ XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+#define XFS_BTREE_STATS_ADD(cur, stat, val) \
+ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */
@@ -253,6 +217,7 @@ typedef struct xfs_btree_cur
__uint8_t bc_nlevels; /* number of levels in the tree */
__uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
xfs_btnum_t bc_btnum; /* identifies which btree type */
+ int bc_statoff; /* offset of btree stats array */
union {
struct { /* needed for BNO, CNT, INO */
struct xfs_buf *agbp; /* agf/agi buffer pointer */
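The XFS_BTREE_STATS conversion above trades the per-type switch for a base offset stashed in the cursor: each *_init_cursor() sets bc_statoff once (via XFS_STATS_CALC_INDEX, as the bmbt/inobt/finobt/refcountbt/rmapbt hunks in this series show), and every stats bump becomes a single indexed increment. A minimal userspace sketch of the same scheme; all names below are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

enum { XBTS_lookup, XBTS_compare, XBTS_insrec, XBTS_MAX };

struct demo_stats {
	unsigned long counters[2 * XBTS_MAX];	/* one slice per btree type */
};

struct demo_cur {
	struct demo_stats *stats;
	int statoff;		/* base index for this cursor's btree type */
};

/* one indexed increment replaces the old per-type switch */
static inline void demo_stats_inc(struct demo_cur *cur, int stat)
{
	cur->stats->counters[cur->statoff + stat]++;
}

int main(void)
{
	struct demo_stats stats = { { 0 } };
	struct demo_cur bmap_cur = { &stats, 0 * XBTS_MAX };
	struct demo_cur ino_cur = { &stats, 1 * XBTS_MAX };

	demo_stats_inc(&bmap_cur, XBTS_lookup);
	demo_stats_inc(&ino_cur, XBTS_lookup);
	printf("bmap lookups %lu, ino lookups %lu\n",
	       stats.counters[0 * XBTS_MAX + XBTS_lookup],
	       stats.counters[1 * XBTS_MAX + XBTS_lookup]);
	return 0;
}

Seeding the offset at cursor-init time keeps the hot stats macros branch-free without losing the per-btree-type breakdown.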
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676ad8cd..a416c7cb23ea 100644
--- a/fs/xfs/libxfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
@@ -6,10 +6,11 @@
/*
* Calculate the intermediate checksum for a buffer that has the CRC field
* inside it. The offset of the 32bit crc fields is passed as the
- * cksum_offset parameter.
+ * cksum_offset parameter. We do not modify the buffer during verification,
+ * hence we have to split the CRC calculation across the cksum_offset.
*/
static inline __uint32_t
-xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset)
{
__uint32_t zero = 0;
__uint32_t crc;
@@ -26,6 +27,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
}
/*
+ * Fast CRC method where the buffer is modified. Callers must have exclusive
+ * access to the buffer while the calculation takes place.
+ */
+static inline __uint32_t
+xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ /* zero the CRC field */
+ *(__le32 *)(buffer + cksum_offset) = 0;
+
+ /* single pass CRC calculation for the entire buffer */
+ return crc32c(XFS_CRC_SEED, buffer, length);
+}
+
+/*
* Convert the intermediate checksum to the final ondisk format.
*
* The CRC32c calculation uses LE format even on BE machines, but returns the
@@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc)
/*
* Helper to generate the checksum for a buffer.
+ *
+ * This modifies the buffer temporarily - callers must have exclusive
+ * access to the buffer while the calculation takes place.
*/
static inline void
xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
{
- __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+ __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset);
*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
}
@@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
static inline int
xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
{
- __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+ __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset);
return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
}
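The comment rewrite above is the heart of this change: verification must not write to a buffer other readers may hold, so the checksum is computed in three chained pieces with a zeroed stand-in for the CRC field, while writers keep the single-pass in-place variant. A userspace sketch of the split calculation; crc32c() is declared here as an assumed stand-in for the kernel's crc32c(), and the ~0 seed mirrors XFS_CRC_SEED.

#include <stddef.h>
#include <stdint.h>

/* assumed: any CRC32c with a chainable seed, like the kernel's crc32c() */
uint32_t crc32c(uint32_t seed, const void *data, size_t len);

static uint32_t start_cksum_safe(const char *buf, size_t len, size_t off)
{
	uint32_t zero = 0;
	uint32_t crc;

	crc = crc32c(~0U, buf, off);		/* bytes before the CRC field */
	crc = crc32c(crc, &zero, sizeof(zero));	/* zeroed stand-in for the field */
	return crc32c(crc, buf + off + sizeof(zero),
		      len - off - sizeof(zero));	/* bytes after the field */
}

xfs_update_cksum() keeps the faster single-pass form because callers writing a buffer already hold it exclusively and may zero the CRC field in place.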
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 20a96dd5af7e..c58d72c220f5 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -93,7 +93,7 @@ xfs_ascii_ci_compname(
return result;
}
-static struct xfs_nameops xfs_ascii_ci_nameops = {
+static const struct xfs_nameops xfs_ascii_ci_nameops = {
.hashname = xfs_ascii_ci_hashname,
.compname = xfs_ascii_ci_compname,
};
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index becc926c3e3d..0197590fa7d7 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -157,6 +157,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
+extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo,
+ const struct xfs_dir_ops *ops,
+ struct xfs_dir2_data_hdr *hdr, int *loghead);
extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
@@ -177,6 +180,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
struct xfs_dir2_data_unused *dup);
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 725fc7841fde..d478065b9544 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -329,7 +329,7 @@ xfs_dir3_data_read(
err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
return err;
}
@@ -505,8 +505,9 @@ xfs_dir2_data_freeremove(
* Given a data block, reconstruct its bestfree map.
*/
void
-xfs_dir2_data_freescan(
- struct xfs_inode *dp,
+xfs_dir2_data_freescan_int(
+ struct xfs_da_geometry *geo,
+ const struct xfs_dir_ops *ops,
struct xfs_dir2_data_hdr *hdr,
int *loghead)
{
@@ -516,7 +517,6 @@ xfs_dir2_data_freescan(
struct xfs_dir2_data_free *bf;
char *endp; /* end of block's data */
char *p; /* current entry pointer */
- struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -526,13 +526,13 @@ xfs_dir2_data_freescan(
/*
* Start by clearing the table.
*/
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = ops->data_bestfree_p(hdr);
memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
/*
* Set up pointers.
*/
- p = (char *)dp->d_ops->data_entry_p(hdr);
+ p = (char *)ops->data_entry_p(hdr);
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
btp = xfs_dir2_block_tail_p(geo, hdr);
@@ -559,12 +559,22 @@ xfs_dir2_data_freescan(
else {
dep = (xfs_dir2_data_entry_t *)p;
ASSERT((char *)dep - (char *)hdr ==
- be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
- p += dp->d_ops->data_entsize(dep->namelen);
+ be16_to_cpu(*ops->data_entry_tag_p(dep)));
+ p += ops->data_entsize(dep->namelen);
}
}
}
+void
+xfs_dir2_data_freescan(
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
+{
+ return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops,
+ hdr, loghead);
+}
+
/*
* Initialize a data block at the given block number in the directory.
* Give back the buffer for the created block.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index ef9f6ead96a4..d04547fcf274 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -21,7 +21,6 @@
struct dir_context;
/* xfs_dir2.c */
-extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
xfs_dir2_db_t *dbp);
extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 51b4e0de1fdc..f272abff11e1 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2344,7 +2344,8 @@ xfs_imap(
imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
imap->im_len = XFS_FSB_TO_BB(mp, 1);
- imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+ imap->im_boffset = (unsigned short)(offset <<
+ mp->m_sb.sb_inodelog);
return 0;
}
@@ -2372,7 +2373,7 @@ out_map:
imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
- imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+ imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
/*
* If the inode number maps to a block outside the bounds
@@ -2450,8 +2451,6 @@ xfs_ialloc_log_agi(
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
-
/*
* Compute byte offsets for the first and last fields in the first
* region and log the agi buffer. This only logs up through
@@ -2512,8 +2511,15 @@ xfs_agi_verify(
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return false;
- if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+ if (be32_to_cpu(agi->agi_level) < 1 ||
+ be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+ return false;
+
+ if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ (be32_to_cpu(agi->agi_free_level) < 1 ||
+ be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS))
return false;
+
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2592,6 +2598,8 @@ xfs_read_agi(
XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
+ if (tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
xfs_buf_set_ref(*bpp, XFS_AGI_REF);
return 0;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index eab68ae2e011..0fd086d03d41 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -357,7 +357,7 @@ xfs_inobt_init_cursor(
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
cur->bc_tp = tp;
cur->bc_mp = mp;
@@ -365,9 +365,11 @@ xfs_inobt_init_cursor(
if (btnum == XFS_BTNUM_INO) {
cur->bc_nlevels = be32_to_cpu(agi->agi_level);
cur->bc_ops = &xfs_inobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
} else {
cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
cur->bc_ops = &xfs_finobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
}
cur->bc_blocklog = mp->m_sb.sb_blocklog;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 134424fac434..dd483e2767f7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -383,7 +383,7 @@ xfs_log_dinode_to_disk(
static bool
xfs_dinode_verify(
struct xfs_mount *mp,
- struct xfs_inode *ip,
+ xfs_ino_t ino,
struct xfs_dinode *dip)
{
uint16_t flags;
@@ -392,6 +392,14 @@ xfs_dinode_verify(
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return false;
+ /* don't allow invalid i_size */
+ if (be64_to_cpu(dip->di_size) & (1ULL << 63))
+ return false;
+
+ /* No zero-length symlinks. */
+ if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0)
+ return false;
+
/* only version 3 or greater inodes are extensively verified here */
if (dip->di_version < 3)
return true;
@@ -401,7 +409,7 @@ xfs_dinode_verify(
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF))
return false;
- if (be64_to_cpu(dip->di_ino) != ip->i_ino)
+ if (be64_to_cpu(dip->di_ino) != ino)
return false;
if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
return false;
@@ -436,7 +444,7 @@ xfs_dinode_calc_crc(
return;
ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
- crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF);
dip->di_crc = xfs_end_cksum(crc);
}
@@ -493,7 +501,7 @@ xfs_iread(
return error;
/* even unallocated inodes are verified */
- if (!xfs_dinode_verify(mp, ip, dip)) {
+ if (!xfs_dinode_verify(mp, ip->i_ino, dip)) {
xfs_alert(mp, "%s: validation failed for inode %lld failed",
__func__, ip->i_ino);
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 3cfe12a4f58a..6848a0afbce7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -58,8 +58,8 @@ struct xfs_icdinode {
*/
struct xfs_imap {
xfs_daddr_t im_blkno; /* starting BB of inode chunk */
- ushort im_len; /* length in BBs of inode chunk */
- ushort im_boffset; /* inode offset in block in bytes */
+ unsigned short im_len; /* length in BBs of inode chunk */
+ unsigned short im_boffset; /* inode offset in block in bytes */
};
int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 5dd56d3dbb3a..222e103356c6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -775,6 +775,13 @@ xfs_idestroy_fork(
}
}
+/* Count number of incore extents based on if_bytes */
+xfs_extnum_t
+xfs_iext_count(struct xfs_ifork *ifp)
+{
+ return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+}
+
/*
* Convert in-core extents to on-disk form
*
@@ -803,7 +810,7 @@ xfs_iextents_copy(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(ifp->if_bytes > 0);
- nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nrecs = xfs_iext_count(ifp);
XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
ASSERT(nrecs > 0);
@@ -941,7 +948,7 @@ xfs_iext_get_ext(
xfs_extnum_t idx) /* index of target extent */
{
ASSERT(idx >= 0);
- ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+ ASSERT(idx < xfs_iext_count(ifp));
if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
return ifp->if_u1.if_ext_irec->er_extbuf;
@@ -1017,7 +1024,7 @@ xfs_iext_add(
int new_size; /* size of extents after adding */
xfs_extnum_t nextents; /* number of extents in file */
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT((idx >= 0) && (idx <= nextents));
byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
new_size = ifp->if_bytes + byte_diff;
@@ -1241,7 +1248,7 @@ xfs_iext_remove(
trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
ASSERT(ext_diff > 0);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
if (new_size == 0) {
@@ -1270,7 +1277,7 @@ xfs_iext_remove_inline(
ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
ASSERT(idx < XFS_INLINE_EXTS);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT(((nextents - ext_diff) > 0) &&
(nextents - ext_diff) < XFS_INLINE_EXTS);
@@ -1309,7 +1316,7 @@ xfs_iext_remove_direct(
ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
new_size = ifp->if_bytes -
(ext_diff * sizeof(xfs_bmbt_rec_t));
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (new_size == 0) {
xfs_iext_destroy(ifp);
@@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct(
int size; /* size of file extents */
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT(nextents <= XFS_LINEAR_EXTS);
size = nextents * sizeof(xfs_bmbt_rec_t);
@@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext(
xfs_extnum_t nextents; /* number of file extents */
xfs_fileoff_t startoff = 0; /* start offset of extent */
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (nextents == 0) {
*idxp = 0;
return NULL;
@@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec(
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
ASSERT(page_idx >= 0);
- ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
- ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+ ASSERT(page_idx <= xfs_iext_count(ifp));
+ ASSERT(page_idx < xfs_iext_count(ifp) || realloc);
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
erp_idx = 0;
@@ -1782,7 +1789,7 @@ xfs_iext_irec_init(
xfs_extnum_t nextents; /* number of extents in file */
ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
ASSERT(nextents <= XFS_LINEAR_EXTS);
erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
@@ -1906,7 +1913,7 @@ xfs_iext_irec_compact(
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
if (nextents == 0) {
xfs_iext_destroy(ifp);
@@ -1996,3 +2003,49 @@ xfs_ifork_init_cow(
ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
ip->i_cnextents = 0;
}
+
+/*
+ * Lookup the extent covering bno.
+ *
+ * If there is an extent covering bno return the extent index, and store the
+ * expanded extent structure in *gotp, and the extent index in *idx.
+ * If there is no extent covering bno, but there is an extent after it (e.g.
+ * it lies in a hole) return that extent in *gotp and its index in *idx
+ * instead.
+ * If bno is beyond the last extent return false, and return the index after
+ * the last valid index in *idxp.
+ */
+bool
+xfs_iext_lookup_extent(
+ struct xfs_inode *ip,
+ struct xfs_ifork *ifp,
+ xfs_fileoff_t bno,
+ xfs_extnum_t *idxp,
+ struct xfs_bmbt_irec *gotp)
+{
+ struct xfs_bmbt_rec_host *ep;
+
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
+
+ ep = xfs_iext_bno_to_ext(ifp, bno, idxp);
+ if (!ep)
+ return false;
+ xfs_bmbt_get_all(ep, gotp);
+ return true;
+}
+
+/*
+ * Return true if there is an extent at index idx, and return the expanded
+ * extent structure at idx in that case. Else return false.
+ */
+bool
+xfs_iext_get_extent(
+ struct xfs_ifork *ifp,
+ xfs_extnum_t idx,
+ struct xfs_bmbt_irec *gotp)
+{
+ if (idx < 0 || idx >= xfs_iext_count(ifp))
+ return false;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp);
+ return true;
+}
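These two helpers are what let the earlier xfs_bmap.c hunks drop xfs_bmap_search_extents() and the open-coded xfs_iext_get_ext()/xfs_bmbt_get_all() pairs. A sketch of the caller-side pattern they enable (compare the __xfs_bunmapi and xfs_bmapi_write hunks above); it assumes the kernel types and the helpers just added, so it is illustrative rather than a drop-in function.

static void
demo_walk_extents(
	struct xfs_inode	*ip,
	struct xfs_ifork	*ifp,
	xfs_fileoff_t		bno)
{
	struct xfs_bmbt_irec	got, prev;
	xfs_extnum_t		idx;

	/* one call replaces xfs_bmap_search_extents() and its eof flag */
	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
		return;			/* bno is past the last extent */

	/* the previous extent, bounds-checked by the helper */
	if (!xfs_iext_get_extent(ifp, idx - 1, &prev))
		prev.br_startoff = NULLFILEOFF;

	do {
		/* ... operate on got ... */
	} while (xfs_iext_get_extent(ifp, ++idx, &got));
}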
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index c9476f50e32d..7fb8365326d1 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -152,6 +152,7 @@ void xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
struct xfs_bmbt_rec_host *
xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
+xfs_extnum_t xfs_iext_count(struct xfs_ifork *);
void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
struct xfs_bmbt_irec *, int);
void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
@@ -181,6 +182,12 @@ void xfs_iext_irec_compact_pages(struct xfs_ifork *);
void xfs_iext_irec_compact_full(struct xfs_ifork *);
void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
+bool xfs_iext_lookup_extent(struct xfs_inode *ip,
+ struct xfs_ifork *ifp, xfs_fileoff_t bno,
+ xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp);
+bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx,
+ struct xfs_bmbt_irec *gotp);
+
extern struct kmem_zone *xfs_ifork_zone;
extern void xfs_ifork_init_cow(struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 083cdd6d6c28..7ae571f8e34a 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version)
typedef struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
unsigned short blf_size; /* size of this item */
- ushort blf_flags; /* misc state */
- ushort blf_len; /* number of blocks in this buf */
+ unsigned short blf_flags; /* misc state */
+ unsigned short blf_len; /* number of blocks in this buf */
__int64_t blf_blkno; /* starting blkno of this buf */
unsigned int blf_map_size; /* used size of data bitmap in words */
unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 8e385f91d660..d9f65e2d5cc8 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -52,7 +52,7 @@ typedef struct xlog_recover {
struct list_head r_itemq; /* q for items */
} xlog_recover_t;
-#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
+#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
/*
* This is the number of entries in the l_buf_cancel_table used during
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 453bb2757ec2..6fb2215f8ff7 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor(
cur->bc_btnum = XFS_BTNUM_REFC;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_ops = &xfs_refcountbt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 83e672ff7577..de25771764ba 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor(
cur->bc_blocklog = mp->m_sb.sb_blocklog;
cur->bc_ops = &xfs_rmapbt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index e2e1106c9fad..ea45584a9913 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1016,4 +1016,3 @@ xfs_rtfree_extent(
}
return 0;
}
-
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a70aec910626..2580262e4ea0 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -262,6 +262,12 @@ xfs_mount_validate_sb(
return -EFSCORRUPTED;
}
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) {
+ xfs_notice(mp, "v5 SB sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
/*
* Until this is fixed only page-sized or smaller data blocks work.
*/
@@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
- if (sbp->sb_qflags & XFS_PQUOTA_ACCT) {
+ if (sbp->sb_qflags & XFS_PQUOTA_ACCT &&
+ sbp->sb_gquotino != NULLFSINO) {
/*
* In older versions of the superblock, the on-disk superblock only
* has sb_gquotino, and the in-core superblock has both sb_gquotino
* and sb_pquotino. But only one of them is supported at any
* point in time. So, if PQUOTA is set in the disk superblock,
- * copy over sb_gquotino to sb_pquotino.
+ * copy over sb_gquotino to sb_pquotino. The NULLFSINO test
+ * above is to make sure we don't do this twice and wipe them
+ * both out!
*/
sbp->sb_pquotino = sbp->sb_gquotino;
sbp->sb_gquotino = NULLFSINO;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 8d74870468c2..717909f2f7b7 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -57,7 +57,6 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
-#define NULLEXTNUM ((xfs_extnum_t)-1)
#define NULLCOMMITLSN ((xfs_lsn_t)-1)
@@ -75,11 +74,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
+ * CRC enabled filesystems use 512 byte inodes, meaning 512 byte block sizes
+ * cannot be used.
*/
#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
+#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1))
#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3e57a56cf829..0f56fcd3a5d5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -37,11 +37,6 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
-/* flags for direct write completions */
-#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
-#define XFS_DIO_FLAG_APPEND (1 << 1)
-#define XFS_DIO_FLAG_COW (1 << 2)
-
/*
* structure owned by writepages passed to individual writepage calls
*/
@@ -495,8 +490,8 @@ xfs_submit_ioend(
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = xfs_end_bio;
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
@@ -567,8 +562,7 @@ xfs_chain_bio(
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
- bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
- (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+ ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
submit_bio(ioend->io_bio);
ioend->io_bio = new;
}
@@ -777,7 +771,7 @@ xfs_map_cow(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_bmbt_irec imap;
- bool is_cow = false, need_alloc = false;
+ bool is_cow = false;
int error;
/*
@@ -795,7 +789,7 @@ xfs_map_cow(
* Else we need to check if there is a COW mapping at this offset.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
+ is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!is_cow)
@@ -805,7 +799,7 @@ xfs_map_cow(
* And if the COW mapping has a delayed extent here we need to
* allocate real space for it now.
*/
- if (need_alloc) {
+ if (isnullstartblock(imap.br_startblock)) {
error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
&imap);
if (error)
@@ -1176,45 +1170,6 @@ xfs_vm_releasepage(
}
/*
- * When we map a DIO buffer, we may need to pass flags to
- * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
- *
- * Note that for DIO, an IO to the highest supported file block offset (i.e.
- * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
- * bit variable. Hence if we see this overflow, we have to assume that the IO is
- * extending the file size. We won't know for sure until IO completion is run
- * and the actual max write offset is communicated to the IO completion
- * routine.
- */
-static void
-xfs_map_direct(
- struct inode *inode,
- struct buffer_head *bh_result,
- struct xfs_bmbt_irec *imap,
- xfs_off_t offset,
- bool is_cow)
-{
- uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
- xfs_off_t size = bh_result->b_size;
-
- trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
- ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
- XFS_IO_OVERWRITE, imap);
-
- if (ISUNWRITTEN(imap)) {
- *flags |= XFS_DIO_FLAG_UNWRITTEN;
- set_buffer_defer_completion(bh_result);
- } else if (is_cow) {
- *flags |= XFS_DIO_FLAG_COW;
- set_buffer_defer_completion(bh_result);
- }
- if (offset + size > i_size_read(inode) || offset + size < 0) {
- *flags |= XFS_DIO_FLAG_APPEND;
- set_buffer_defer_completion(bh_result);
- }
-}
-
-/*
* If this is O_DIRECT or the mpage code calling tell them how large the mapping
* is, so that we can avoid repeated get_blocks calls.
*
@@ -1254,52 +1209,12 @@ xfs_map_trim_size(
bh_result->b_size = mapping_size;
}
-/* Bounce unaligned directio writes to the page cache. */
static int
-xfs_bounce_unaligned_dio_write(
- struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- struct xfs_bmbt_irec *imap)
-{
- struct xfs_bmbt_irec irec;
- xfs_fileoff_t delta;
- bool shared;
- bool x;
- int error;
-
- irec = *imap;
- if (offset_fsb > irec.br_startoff) {
- delta = offset_fsb - irec.br_startoff;
- irec.br_blockcount -= delta;
- irec.br_startblock += delta;
- irec.br_startoff = offset_fsb;
- }
- error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
- if (error)
- return error;
-
- /*
- * We're here because we're trying to do a directio write to a
- * region that isn't aligned to a filesystem block. If any part
- * of the extent is shared, fall back to buffered mode to handle
- * the RMW. This is done by returning -EREMCHG ("remote addr
- * changed"), which is caught further up the call stack.
- */
- if (shared) {
- trace_xfs_reflink_bounce_dio_write(ip, imap);
- return -EREMCHG;
- }
- return 0;
-}
-
-STATIC int
-__xfs_get_blocks(
+xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
- int create,
- bool direct,
- bool dax_fault)
+ int create)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1310,11 +1225,8 @@ __xfs_get_blocks(
int nimaps = 1;
xfs_off_t offset;
ssize_t size;
- int new = 0;
- bool is_cow = false;
- bool need_alloc = false;
- BUG_ON(create && !direct);
+ BUG_ON(create);
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1323,7 +1235,7 @@ __xfs_get_blocks(
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
- if (!create && offset >= i_size_read(inode))
+ if (offset >= i_size_read(inode))
return 0;
/*
@@ -1338,52 +1250,12 @@ __xfs_get_blocks(
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
- if (create && direct && xfs_is_reflink_inode(ip))
- is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
- &need_alloc);
- if (!is_cow) {
- error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
- &imap, &nimaps, XFS_BMAPI_ENTIRE);
- /*
- * Truncate an overwrite extent if there's a pending CoW
- * reservation before the end of this extent. This
- * forces us to come back to get_blocks to take care of
- * the CoW.
- */
- if (create && direct && nimaps &&
- imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(&imap))
- xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
- &imap);
- }
- ASSERT(!need_alloc);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, XFS_BMAPI_ENTIRE);
if (error)
goto out_unlock;
- /* for DAX, we convert unwritten extents directly */
- if (create &&
- (!nimaps ||
- (imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK) ||
- (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
-
- error = xfs_iomap_write_direct(ip, offset, size,
- &imap, nimaps);
- if (error)
- return error;
- new = 1;
-
- trace_xfs_get_blocks_alloc(ip, offset, size,
- ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
- : XFS_IO_DELALLOC, &imap);
- } else if (nimaps) {
+ if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size,
ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
: XFS_IO_OVERWRITE, &imap);
@@ -1393,12 +1265,6 @@ __xfs_get_blocks(
goto out_unlock;
}
- if (IS_DAX(inode) && create) {
- ASSERT(!ISUNWRITTEN(&imap));
- /* zeroing is not needed at a higher layer */
- new = 0;
- }
-
/* trim mapping down to size requested */
xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
@@ -1408,50 +1274,14 @@ __xfs_get_blocks(
*/
if (imap.br_startblock != HOLESTARTBLOCK &&
imap.br_startblock != DELAYSTARTBLOCK &&
- (create || !ISUNWRITTEN(&imap))) {
- if (create && direct && !is_cow) {
- error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
- &imap);
- if (error)
- return error;
- }
-
+ !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
- if (ISUNWRITTEN(&imap))
- set_buffer_unwritten(bh_result);
- /* direct IO needs special help */
- if (create) {
- if (dax_fault)
- ASSERT(!ISUNWRITTEN(&imap));
- else
- xfs_map_direct(inode, bh_result, &imap, offset,
- is_cow);
- }
- }
/*
* If this is a realtime file, data may be on a different device.
* to that pointed to from the buffer_head b_bdev currently.
*/
bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
-
- /*
- * If we previously allocated a block out beyond eof and we are now
- * coming back to use it then we will need to flag it as new even if it
- * has a disk address.
- *
- * With sub-block writes into unwritten extents we also need to mark
- * the buffer as new so that the unwritten parts of the buffer gets
- * correctly zeroed.
- */
- if (create &&
- ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
- (offset >= i_size_read(inode)) ||
- (new || ISUNWRITTEN(&imap))))
- set_buffer_new(bh_result);
-
- BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
-
return 0;
out_unlock:
@@ -1459,110 +1289,6 @@ out_unlock:
return error;
}
-int
-xfs_get_blocks(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
-}
-
-int
-xfs_get_blocks_direct(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
-}
-
-int
-xfs_get_blocks_dax_fault(
- struct inode *inode,
- sector_t iblock,
- struct buffer_head *bh_result,
- int create)
-{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
-}
-
-/*
- * Complete a direct I/O write request.
- *
- * xfs_map_direct passes us some flags in the private data to tell us what to
- * do. If no flags are set, then the write IO is an overwrite wholly within
- * the existing allocated file size and so there is nothing for us to do.
- *
- * Note that in this case the completion can be called in interrupt context,
- * whereas if we have flags set we will always be called in task context
- * (i.e. from a workqueue).
- */
-int
-xfs_end_io_direct_write(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_inode *ip = XFS_I(inode);
- uintptr_t flags = (uintptr_t)private;
- int error = 0;
-
- trace_xfs_end_io_direct_write(ip, offset, size);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- if (size <= 0)
- return size;
-
- /*
- * The flags tell us whether we are doing unwritten extent conversions
- * or an append transaction that updates the on-disk file size. These
- * cases are the only cases where we should *potentially* be needing
- * to update the VFS inode size.
- */
- if (flags == 0) {
- ASSERT(offset + size <= i_size_read(inode));
- return 0;
- }
-
- /*
- * We need to update the in-core inode size here so that we don't end up
- * with the on-disk inode size being outside the in-core inode size. We
- * have no other method of updating EOF for AIO, so always do it here
- * if necessary.
- *
- * We need to lock the test/set EOF update as we can be racing with
- * other IO completions here to update the EOF. Failing to serialise
- * here can result in EOF moving backwards and Bad Things Happen when
- * that occurs.
- */
- spin_lock(&ip->i_flags_lock);
- if (offset + size > i_size_read(inode))
- i_size_write(inode, offset + size);
- spin_unlock(&ip->i_flags_lock);
-
- if (flags & XFS_DIO_FLAG_COW)
- error = xfs_reflink_end_cow(ip, offset, size);
- if (flags & XFS_DIO_FLAG_UNWRITTEN) {
- trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-
- error = xfs_iomap_write_unwritten(ip, offset, size);
- }
- if (flags & XFS_DIO_FLAG_APPEND) {
- trace_xfs_end_io_direct_write_append(ip, offset, size);
-
- error = xfs_setfilesize(ip, offset, size);
- }
-
- return error;
-}
-
STATIC ssize_t
xfs_vm_direct_IO(
struct kiocb *iocb,
@@ -1583,7 +1309,6 @@ xfs_vm_bmap(
struct xfs_inode *ip = XFS_I(inode);
trace_xfs_vm_bmap(XFS_I(inode));
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
/*
* The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1591,12 +1316,10 @@ xfs_vm_bmap(
* that on reflink inodes, so we have to skip out here. And yes,
* 0 is the magic code for a bmap error.
*/
- if (xfs_is_reflink_inode(ip)) {
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ if (xfs_is_reflink_inode(ip))
return 0;
- }
+
filemap_write_and_wait(mapping);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index b3c6634f9518..cc174ec6c2fd 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -55,15 +55,6 @@ struct xfs_ioend {
extern const struct address_space_operations xfs_address_space_operations;
-int xfs_get_blocks(struct inode *inode, sector_t offset,
- struct buffer_head *map_bh, int create);
-int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
- struct buffer_head *map_bh, int create);
-int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
- struct buffer_head *map_bh, int create);
-
-int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private);
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index e3da5d448bcf..d14691aa02b4 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern {
*========================================================================*/
-/* Return 0 on success, or -errno; other state communicated via *context */
-typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+/* void; state communicated via *context */
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
unsigned char *, int, int);
typedef struct xfs_attr_list_context {
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 25e76cd6c053..97c45b6eb91e 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
xfs_attr_sf_entry_t *sfe;
xfs_inode_t *dp;
int sbsize, nsbuf, count, i;
- int error;
ASSERT(context != NULL);
dp = context->dp;
@@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
(XFS_ISRESET_CURSOR(cursor) &&
(dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
- error = context->put_listent(context,
- sfe->flags,
- sfe->nameval,
- (int)sfe->namelen,
- (int)sfe->valuelen);
- if (error)
- return error;
+ context->put_listent(context,
+ sfe->flags,
+ sfe->nameval,
+ (int)sfe->namelen,
+ (int)sfe->valuelen);
/*
* Either search callback finished early or
* didn't fit it all in the buffer after all.
@@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
cursor->hashval = sbp->hash;
cursor->offset = 0;
}
- error = context->put_listent(context,
- sbp->flags,
- sbp->name,
- sbp->namelen,
- sbp->valuelen);
- if (error) {
- kmem_free(sbuf);
- return error;
- }
+ context->put_listent(context,
+ sbp->flags,
+ sbp->name,
+ sbp->namelen,
+ sbp->valuelen);
if (context->seen_enough)
break;
cursor->offset++;
@@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
for (;;) {
leaf = bp->b_addr;
- error = xfs_attr3_leaf_list_int(bp, context);
- if (error) {
- xfs_trans_brelse(NULL, bp);
- return error;
- }
+ xfs_attr3_leaf_list_int(bp, context);
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
if (context->seen_enough || leafhdr.forw == 0)
break;
@@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
/*
* Copy out attribute list entries for attr_list(), for leaf attribute lists.
*/
-int
+void
xfs_attr3_leaf_list_int(
struct xfs_buf *bp,
struct xfs_attr_list_context *context)
@@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int(
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
struct xfs_attr_leaf_entry *entry;
- int retval;
int i;
struct xfs_mount *mp = context->dp->i_mount;
@@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int(
}
if (i == ichdr.count) {
trace_xfs_attr_list_notfound(context);
- return 0;
+ return;
}
} else {
entry = &entries[0];
@@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int(
/*
* We have found our place, start copying out the new attributes.
*/
- retval = 0;
for (; i < ichdr.count; entry++, i++) {
char *name;
int namelen, valuelen;
@@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int(
valuelen = be32_to_cpu(name_rmt->valuelen);
}
- retval = context->put_listent(context, entry->flags,
+ context->put_listent(context, entry->flags,
name, namelen, valuelen);
- if (retval)
- break;
if (context->seen_enough)
break;
cursor->offset++;
}
trace_xfs_attr_list_leaf_end(context);
- return retval;
+ return;
}
/*
@@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
if (error)
return error;
- error = xfs_attr3_leaf_list_int(bp, context);
+ xfs_attr3_leaf_list_int(bp, context);
xfs_trans_brelse(NULL, bp);
- return error;
+ return 0;
}
int
@@ -513,7 +498,7 @@ xfs_attr_list_int(
* Take care to check values and protect against them changing later,
* we may be reading them directly out of a user buffer.
*/
-STATIC int
+STATIC void
xfs_attr_put_listent(
xfs_attr_list_context_t *context,
int flags,
@@ -536,10 +521,10 @@ xfs_attr_put_listent(
*/
if (((context->flags & ATTR_SECURE) == 0) !=
((flags & XFS_ATTR_SECURE) == 0))
- return 0;
+ return;
if (((context->flags & ATTR_ROOT) == 0) !=
((flags & XFS_ATTR_ROOT) == 0))
- return 0;
+ return;
arraytop = sizeof(*alist) +
context->count * sizeof(alist->al_offset[0]);
@@ -548,7 +533,7 @@ xfs_attr_put_listent(
trace_xfs_attr_list_full(context);
alist->al_more = 1;
context->seen_enough = 1;
- return 0;
+ return;
}
aep = (attrlist_ent_t *)&context->alist[context->firstu];
@@ -558,7 +543,7 @@ xfs_attr_put_listent(
alist->al_offset[context->count++] = context->firstu;
alist->al_count = context->count;
trace_xfs_attr_list_add(context);
- return 0;
+ return;
}
/*
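The attr-list hunks above convert put_listent_func_t from an int return to void: the callback can no longer fail, so early termination is signalled solely through context->seen_enough and the walkers poll it after each call. A self-contained sketch of that control-flow shape, with simplified stand-in names.

#include <stdio.h>

struct demo_list_ctx {
	int	count;		/* entries emitted so far */
	int	max;		/* buffer capacity */
	int	seen_enough;	/* set by the callback, polled by the walker */
};

/* void callback: all stop/success state lives in the context */
static void demo_put_listent(struct demo_list_ctx *ctx)
{
	if (ctx->count >= ctx->max) {
		ctx->seen_enough = 1;	/* buffer full: tell the walker to stop */
		return;
	}
	ctx->count++;
}

static void demo_walk(struct demo_list_ctx *ctx, int nentries)
{
	int i;

	for (i = 0; i < nentries; i++) {
		demo_put_listent(ctx);
		if (ctx->seen_enough)	/* replaces the old error-return check */
			break;
	}
}

int main(void)
{
	struct demo_list_ctx ctx = { 0, 3, 0 };

	demo_walk(&ctx, 10);
	printf("emitted %d, seen_enough %d\n", ctx.count, ctx.seen_enough);
	return 0;
}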
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 552465e011ec..b9abce524c33 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -359,9 +359,7 @@ xfs_bmap_count_blocks(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
- xfs_bmap_count_leaves(ifp, 0,
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
- count);
+ xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count);
return 0;
}
@@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole(
ifp = XFS_IFORK_PTR(ip, whichfork);
if (!moretocome &&
xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
- (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+ (lastx == xfs_iext_count(ifp) - 1))
out->bmv_oflags |= BMV_OF_LAST;
}
@@ -1792,6 +1790,7 @@ xfs_swap_extent_forks(
struct xfs_ifork tempifp, *ifp, *tifp;
int aforkblks = 0;
int taforkblks = 0;
+ xfs_extnum_t nextents;
__uint64_t tmp;
int error;
@@ -1877,14 +1876,13 @@ xfs_swap_extent_forks(
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
+ /*
+ * If the extents fit in the inode, fix the pointer. Otherwise
+ * it's already NULL or pointing to the extent.
*/
- if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- ifp->if_u1.if_extents =
- ifp->if_u2.if_inline_ext;
- }
+ nextents = xfs_iext_count(&ip->i_df);
+ if (nextents <= XFS_INLINE_EXTS)
+ ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -1896,14 +1894,13 @@ xfs_swap_extent_forks(
switch (tip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
- /* If the extents fit in the inode, fix the
- * pointer. Otherwise it's already NULL or
- * pointing to the extent.
+ /*
+ * If the extents fit in the inode, fix the pointer. Otherwise
+ * it's already NULL or pointing to the extent.
*/
- if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
- tifp->if_u1.if_extents =
- tifp->if_u2.if_inline_ext;
- }
+ nextents = xfs_iext_count(&tip->i_df);
+ if (nextents <= XFS_INLINE_EXTS)
+ tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext;
(*target_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
@@ -1938,8 +1935,8 @@ xfs_swap_extents(
* page cache safely. Once we have done this we can take the ilocks and
* do the rest of the checks.
*/
- lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
+ lock_flags = XFS_MMAPLOCK_EXCL;
xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
/* Verify that both files have the same format */
@@ -2079,15 +2076,13 @@ xfs_swap_extents(
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
+out_unlock:
xfs_iunlock(ip, lock_flags);
xfs_iunlock(tip, lock_flags);
+ unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
-
-out_unlock:
- xfs_iunlock(ip, lock_flags);
- xfs_iunlock(tip, lock_flags);
- return error;
+ goto out_unlock;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b5b9bffe3520..7f0a01f7b592 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -219,7 +219,6 @@ _xfs_buf_alloc(
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
- RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
spin_lock_init(&bp->b_lock);
XB_SET_OWNER(bp);
@@ -473,6 +472,62 @@ _xfs_buf_map_pages(
/*
* Finding and Reading Buffers
*/
+static int
+_xfs_buf_obj_cmp(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct xfs_buf_map *map = arg->key;
+ const struct xfs_buf *bp = obj;
+
+ /*
+ * The key hashing in the lookup path depends on the key being the
+ * first element of the compare_arg; make sure to assert this.
+ */
+ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
+
+ if (bp->b_bn != map->bm_bn)
+ return 1;
+
+ if (unlikely(bp->b_length != map->bm_len)) {
+ /*
+ * found a block number match. If the range doesn't
+ * match, the only way this is allowed is if the buffer
+ * in the cache is stale and the transaction that made
+ * it stale has not yet committed. i.e. we are
+ * reallocating a busy extent. Skip this buffer and
+ * continue searching for an exact match.
+ */
+ ASSERT(bp->b_flags & XBF_STALE);
+ return 1;
+ }
+ return 0;
+}
+
+static const struct rhashtable_params xfs_buf_hash_params = {
+ .min_size = 32, /* empty AGs have minimal footprint */
+ .nelem_hint = 16,
+ .key_len = sizeof(xfs_daddr_t),
+ .key_offset = offsetof(struct xfs_buf, b_bn),
+ .head_offset = offsetof(struct xfs_buf, b_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = _xfs_buf_obj_cmp,
+};
+
+int
+xfs_buf_hash_init(
+ struct xfs_perag *pag)
+{
+ spin_lock_init(&pag->pag_buf_lock);
+ return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+}
+
+void
+xfs_buf_hash_destroy(
+ struct xfs_perag *pag)
+{
+ rhashtable_destroy(&pag->pag_buf_hash);
+}
/*
* Look up, and creates if absent, a lockable buffer for
@@ -488,27 +543,24 @@ _xfs_buf_find(
xfs_buf_t *new_bp)
{
struct xfs_perag *pag;
- struct rb_node **rbp;
- struct rb_node *parent;
xfs_buf_t *bp;
- xfs_daddr_t blkno = map[0].bm_bn;
+ struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
- int numblks = 0;
int i;
for (i = 0; i < nmaps; i++)
- numblks += map[i].bm_len;
+ cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
- ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+ ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (blkno < 0 || blkno >= eofs) {
+ if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
/*
* XXX (dgc): we should really be returning -EFSCORRUPTED here,
* but none of the higher level infrastructure supports
@@ -516,53 +568,29 @@ _xfs_buf_find(
*/
xfs_alert(btp->bt_mount,
"%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
- __func__, blkno, eofs);
+ __func__, cmap.bm_bn, eofs);
WARN_ON(1);
return NULL;
}
- /* get tree root */
pag = xfs_perag_get(btp->bt_mount,
- xfs_daddr_to_agno(btp->bt_mount, blkno));
+ xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
- /* walk tree */
spin_lock(&pag->pag_buf_lock);
- rbp = &pag->pag_buf_tree.rb_node;
- parent = NULL;
- bp = NULL;
- while (*rbp) {
- parent = *rbp;
- bp = rb_entry(parent, struct xfs_buf, b_rbnode);
-
- if (blkno < bp->b_bn)
- rbp = &(*rbp)->rb_left;
- else if (blkno > bp->b_bn)
- rbp = &(*rbp)->rb_right;
- else {
- /*
- * found a block number match. If the range doesn't
- * match, the only way this is allowed is if the buffer
- * in the cache is stale and the transaction that made
- * it stale has not yet committed. i.e. we are
- * reallocating a busy extent. Skip this buffer and
- * continue searching to the right for an exact match.
- */
- if (bp->b_length != numblks) {
- ASSERT(bp->b_flags & XBF_STALE);
- rbp = &(*rbp)->rb_right;
- continue;
- }
- atomic_inc(&bp->b_hold);
- goto found;
- }
+ bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
+ xfs_buf_hash_params);
+ if (bp) {
+ atomic_inc(&bp->b_hold);
+ goto found;
}
/* No match found */
if (new_bp) {
- rb_link_node(&new_bp->b_rbnode, parent, rbp);
- rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
/* the buffer keeps the perag reference until it is freed */
new_bp->b_pag = pag;
+ rhashtable_insert_fast(&pag->pag_buf_hash,
+ &new_bp->b_rhash_head,
+ xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
} else {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
@@ -930,7 +958,6 @@ xfs_buf_rele(
if (!pag) {
ASSERT(list_empty(&bp->b_lru));
- ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold)) {
xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp);
@@ -938,8 +965,6 @@ xfs_buf_rele(
return;
}
- ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
-
ASSERT(atomic_read(&bp->b_hold) > 0);
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
@@ -983,7 +1008,8 @@ xfs_buf_rele(
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+ rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
+ xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
freebuf = true;
@@ -1304,7 +1330,7 @@ _xfs_buf_ioapply(
if (bp->b_flags & XBF_WRITE) {
op = REQ_OP_WRITE;
if (bp->b_flags & XBF_SYNCIO)
- op_flags = WRITE_SYNC;
+ op_flags = REQ_SYNC;
if (bp->b_flags & XBF_FUA)
op_flags |= REQ_FUA;
if (bp->b_flags & XBF_FLUSH)
@@ -1711,8 +1737,7 @@ xfs_free_buftarg(
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- if (mp->m_flags & XFS_MOUNT_BARRIER)
- xfs_blkdev_issue_flush(btp);
+ xfs_blkdev_issue_flush(btp);
kmem_free(btp);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 1c2e52b2d926..8a9d3a9599f0 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_READ, "READ" }, \
{ XBF_WRITE, "WRITE" }, \
{ XBF_READ_AHEAD, "READ_AHEAD" }, \
+ { XBF_NO_IOACCT, "NO_IOACCT" }, \
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
@@ -150,7 +151,7 @@ typedef struct xfs_buf {
* which is the only bit that is touched if we hit the semaphore
* fast-path on locking.
*/
- struct rb_node b_rbnode; /* rbtree node */
+ struct rhash_head b_rhash_head; /* pag buffer hash node */
xfs_daddr_t b_bn; /* block number of buffer */
int b_length; /* size of buffer in BBs */
atomic_t b_hold; /* reference count */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 29816981b50a..003a99b83bd8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -677,7 +677,6 @@ xfs_readdir(
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -686,7 +685,6 @@ xfs_readdir(
rval = xfs_dir2_block_getdents(&args, ctx);
else
rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return rval;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6e4f7f900fea..65d27a502909 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -48,40 +48,6 @@
static const struct vm_operations_struct xfs_file_vm_ops;
/*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
- struct xfs_inode *ip,
- int type)
-{
- if (type & XFS_IOLOCK_EXCL)
- inode_lock(VFS_I(ip));
- xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
- struct xfs_inode *ip,
- int type)
-{
- xfs_iunlock(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- inode_unlock(VFS_I(ip));
-}
-
-static inline void
-xfs_rw_ilock_demote(
- struct xfs_inode *ip,
- int type)
-{
- xfs_ilock_demote(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- inode_unlock(VFS_I(ip));
-}
-
-/*
* Clear the specified ranges to zero through either the pagecache or DAX.
* Holes and unwritten extents will be left as-is as they already are zeroed.
*/
@@ -183,19 +149,16 @@ xfs_file_fsync(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
- if (mp->m_flags & XFS_MOUNT_BARRIER) {
- /*
- * If we have an RT and/or log subvolume we need to make sure
- * to flush the write cache the device used for file data
- * first. This is to ensure newly written file data make
- * it to disk before logging the new inode size in case of
- * an extending write.
- */
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_blkdev_issue_flush(mp->m_rtdev_targp);
- else if (mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_blkdev_issue_flush(mp->m_ddev_targp);
- }
+ /*
+ * If we have an RT and/or log subvolume we need to make sure to flush
+ * the write cache the device used for file data first. This is to
+ * ensure newly written file data make it to disk before logging the new
+ * inode size in case of an extending write.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+ else if (mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_blkdev_issue_flush(mp->m_ddev_targp);
/*
* All metadata updates are logged, which means that we just have to
@@ -230,10 +193,8 @@ xfs_file_fsync(
* an already allocated file and thus do not have any metadata to
* commit.
*/
- if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
- mp->m_logdev_targp == mp->m_ddev_targp &&
- !XFS_IS_REALTIME_INODE(ip) &&
- !log_flushed)
+ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
+ mp->m_logdev_targp == mp->m_ddev_targp)
xfs_blkdev_issue_flush(mp->m_ddev_targp);
return error;
@@ -244,62 +205,21 @@ xfs_file_dio_aio_read(
struct kiocb *iocb,
struct iov_iter *to)
{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- loff_t isize = i_size_read(inode);
+ struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
size_t count = iov_iter_count(to);
- loff_t end = iocb->ki_pos + count - 1;
- struct iov_iter data;
- struct xfs_buftarg *target;
- ssize_t ret = 0;
+ ssize_t ret;
trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
if (!count)
return 0; /* skip atime */
- if (XFS_IS_REALTIME_INODE(ip))
- target = ip->i_mount->m_rtdev_targp;
- else
- target = ip->i_mount->m_ddev_targp;
-
- /* DIO must be aligned to device logical sector size */
- if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
- if (iocb->ki_pos == isize)
- return 0;
- return -EINVAL;
- }
-
file_accessed(iocb->ki_filp);
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
- if (ret)
- goto out_unlock;
-
- /*
- * Invalidate whole pages. This can return an error if we fail
- * to invalidate a page, but this should never happen on XFS.
- * Warn if it does fail.
- */
- ret = invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- data = *to;
- ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
- xfs_get_blocks_direct, NULL, NULL, 0);
- if (ret >= 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(to, ret);
- }
-
-out_unlock:
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -317,9 +237,9 @@ xfs_file_dax_read(
if (!count)
return 0; /* skip atime */
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
file_accessed(iocb->ki_filp);
return ret;
@@ -335,9 +255,9 @@ xfs_file_buffered_aio_read(
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
ret = generic_file_read_iter(iocb, to);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
@@ -418,15 +338,18 @@ restart:
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, true);
+ error = xfs_break_layouts(inode, iolock);
if (error)
return error;
- /* For changing security info in file_remove_privs() we need i_mutex */
+ /*
+ * For changing security info in file_remove_privs() we need i_rwsem
+ * exclusively.
+ */
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_rw_iunlock(ip, *iolock);
+ xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_ilock(ip, *iolock);
goto restart;
}
/*
@@ -451,9 +374,9 @@ restart:
spin_unlock(&ip->i_flags_lock);
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
+ xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_ilock(ip, *iolock);
iov_iter_reexpand(from, count);
}
/*
@@ -496,6 +419,58 @@ restart:
return 0;
}
+static int
+xfs_dio_write_end_io(
+ struct kiocb *iocb,
+ ssize_t size,
+ unsigned flags)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ loff_t offset = iocb->ki_pos;
+ bool update_size = false;
+ int error = 0;
+
+ trace_xfs_end_io_direct_write(ip, offset, size);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
+
+ if (size <= 0)
+ return size;
+
+ /*
+ * We need to update the in-core inode size here so that we don't end up
+ * with the on-disk inode size being outside the in-core inode size. We
+ * have no other method of updating EOF for AIO, so always do it here
+ * if necessary.
+ *
+ * We need to lock the test/set EOF update as we can be racing with
+ * other IO completions here to update the EOF. Failing to serialise
+ * here can result in EOF moving backwards and Bad Things Happen when
+ * that occurs.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (offset + size > i_size_read(inode)) {
+ i_size_write(inode, offset + size);
+ update_size = true;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (flags & IOMAP_DIO_COW) {
+ error = xfs_reflink_end_cow(ip, offset, size);
+ if (error)
+ return error;
+ }
+
+ if (flags & IOMAP_DIO_UNWRITTEN)
+ error = xfs_iomap_write_unwritten(ip, offset, size);
+ else if (update_size)
+ error = xfs_setfilesize(ip, offset, size);
+
+ return error;
+}
+
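This completion handler is wired up further down via iomap_dio_rw(); a minimal sketch of the hookup, assuming the iomap_dio_end_io_t callback signature (struct kiocb *, ssize_t, unsigned) that the function above implements:

	static ssize_t
	example_dio_write(struct kiocb *iocb, struct iov_iter *from)
	{
		/*
		 * On completion, iomap_dio_rw() calls xfs_dio_write_end_io()
		 * with IOMAP_DIO_UNWRITTEN and/or IOMAP_DIO_COW set in
		 * "flags" to request unwritten conversion or COW remapping.
		 */
		return iomap_dio_rw(iocb, from, &xfs_iomap_ops,
				    xfs_dio_write_end_io);
	}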
/*
* xfs_file_dio_aio_write - handle direct IO writes
*
@@ -535,9 +510,7 @@ xfs_file_dio_aio_write(
int unaligned_io = 0;
int iolock;
size_t count = iov_iter_count(from);
- loff_t end;
- struct iov_iter data;
- struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
+ struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
@@ -559,29 +532,12 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
count = iov_iter_count(from);
- end = iocb->ki_pos + count - 1;
-
- if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
- if (ret)
- goto out;
-
- /*
- * Invalidate whole pages. This can return an error if we fail
- * to invalidate a page, but this should never happen on XFS.
- * Warn if it does fail.
- */
- ret = invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
/*
* If we are doing unaligned IO, wait for all other IO to drain,
@@ -591,7 +547,7 @@ xfs_file_dio_aio_write(
if (unaligned_io)
inode_dio_wait(inode);
else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
@@ -604,24 +560,9 @@ xfs_file_dio_aio_write(
goto out;
}
- data = *from;
- ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
- xfs_get_blocks_direct, xfs_end_io_direct_write,
- NULL, DIO_ASYNC_EXTEND);
-
- /* see generic_file_direct_write() for why this is necessary */
- if (mapping->nrpages) {
- invalidate_inode_pages2_range(mapping,
- iocb->ki_pos >> PAGE_SHIFT,
- end >> PAGE_SHIFT);
- }
-
- if (ret > 0) {
- iocb->ki_pos += ret;
- iov_iter_advance(from, ret);
- }
+ ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
/*
* No fallback to buffered IO on errors for XFS, direct IO will either
@@ -643,7 +584,7 @@ xfs_file_dax_write(
size_t count;
loff_t pos;
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
@@ -652,15 +593,13 @@ xfs_file_dax_write(
count = iov_iter_count(from);
trace_xfs_file_dax_write(ip, count, pos);
-
- ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
-
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
return error ? error : ret;
}
@@ -677,7 +616,7 @@ xfs_file_buffered_aio_write(
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
@@ -721,7 +660,7 @@ write_retry:
current->backing_dev_info = NULL;
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
return ret;
}
@@ -797,7 +736,7 @@ xfs_file_fallocate(
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock, false);
+ error = xfs_break_layouts(inode, &iolock);
if (error)
goto out_unlock;
@@ -939,7 +878,6 @@ xfs_file_clone_range(
len, false);
}
-#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
STATIC ssize_t
xfs_file_dedupe_range(
struct file *src_file,
@@ -950,14 +888,6 @@ xfs_file_dedupe_range(
{
int error;
- /*
- * Limit the total length we will dedupe for each operation.
- * This is intended to bound the total time spent in this
- * ioctl to something sane.
- */
- if (len > XFS_MAX_DEDUPE_LEN)
- len = XFS_MAX_DEDUPE_LEN;
-
error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
len, true);
if (error)
@@ -1474,7 +1404,7 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
} else {
ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
@@ -1501,15 +1431,9 @@ xfs_filemap_fault(
return xfs_filemap_page_mkwrite(vma, vmf);
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- if (IS_DAX(inode)) {
- /*
- * we do not want to trigger unwritten extent conversion on read
- * faults - that is unnecessary overhead and would also require
- * changes to xfs_get_blocks_direct() to map unwritten extent
- * ioend for conversion on read-only mappings.
- */
- ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
- } else
+ if (IS_DAX(inode))
+ ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
+ else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1545,7 +1469,7 @@ xfs_filemap_pmd_fault(
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+ ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f295049db681..ff4d6311c7f4 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -70,8 +70,6 @@ xfs_inode_alloc(
ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
/* initialise the xfs inode */
ip->i_ino = ino;
ip->i_mount = mp;
@@ -123,7 +121,6 @@ __xfs_inode_free(
{
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
@@ -133,6 +130,8 @@ void
xfs_inode_free(
struct xfs_inode *ip)
{
+ ASSERT(!xfs_isiflocked(ip));
+
/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
@@ -393,8 +392,8 @@ xfs_iget_cache_hit(
xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
inode->i_state = I_NEW;
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+ init_rwsem(&inode->i_rwsem);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
@@ -981,6 +980,7 @@ restart:
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
+ /* xfs_iflush_abort() drops the flush lock */
xfs_iflush_abort(ip, false);
goto reclaim;
}
@@ -989,10 +989,10 @@ restart:
goto out_ifunlock;
xfs_iunpin_wait(ip);
}
- if (xfs_iflags_test(ip, XFS_ISTALE))
- goto reclaim;
- if (xfs_inode_clean(ip))
+ if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
+ xfs_ifunlock(ip);
goto reclaim;
+ }
/*
* Never flush out dirty data during non-blocking reclaim, as it would
@@ -1030,25 +1030,24 @@ restart:
xfs_buf_relse(bp);
}
- xfs_iflock(ip);
reclaim:
+ ASSERT(!xfs_isiflocked(ip));
+
/*
* Because we use RCU freeing we need to ensure the inode always appears
* to be reclaimed with an invalid inode number when in the free state.
- * We do this as early as possible under the ILOCK and flush lock so
- * that xfs_iflush_cluster() can be guaranteed to detect races with us
- * here. By doing this, we guarantee that once xfs_iflush_cluster has
- * locked both the XFS_ILOCK and the flush lock that it will see either
- * a valid, flushable inode that will serialise correctly against the
- * locks below, or it will see a clean (and invalid) inode that it can
- * skip.
+ * We do this as early as possible under the ILOCK so that
+ * xfs_iflush_cluster() can be guaranteed to detect races with us here.
+ * By doing this, we guarantee that once xfs_iflush_cluster has locked
+ * XFS_ILOCK that it will see either a valid, flushable inode that will
+ * serialise correctly, or it will see a clean (and invalid) inode that
+ * it can skip.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
- xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
@@ -1580,10 +1579,15 @@ xfs_inode_free_cowblocks(
struct xfs_eofblocks *eofb = args;
bool need_iolock = true;
int match;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
- if (!xfs_reflink_has_real_cow_blocks(ip)) {
+ /*
+ * Just clear the tag if we have an empty cow fork or none at all. It's
+ * possible the inode was fully unshared since it was originally tagged.
+ */
+ if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) {
trace_xfs_inode_free_cowblocks_invalid(ip);
xfs_inode_clear_cowblocks_tag(ip);
return 0;
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d45ca72af6fb..865ad1373e5e 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -133,7 +133,7 @@ xfs_icreate_item_committing(
/*
 * This is the ops vector shared by all icreate log items.
*/
-static struct xfs_item_ops xfs_icreate_item_ops = {
+static const struct xfs_item_ops xfs_icreate_item_ops = {
.iop_size = xfs_icreate_item_size,
.iop_format = xfs_icreate_item_format,
.iop_pin = xfs_icreate_item_pin,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4e560e6a12c1..b9557795eb74 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared(
}
/*
- * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
- * the i_lock. This routine allows various combinations of the locks to be
- * obtained.
+ * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
+ * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
+ * various combinations of the locks to be obtained.
*
* The 3 locks should always be ordered so that the IO lock is obtained first,
* the mmap lock second and the ilock last in order to prevent deadlock.
*
* Basic locking order:
*
- * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
*
* mmap_sem locking order:
*
- * i_iolock -> page lock -> mmap_sem
+ * i_rwsem -> page lock -> mmap_sem
* mmap_sem -> i_mmap_lock -> page_lock
*
 * The difference in mmap_sem locking order means that we cannot hold the
 * i_mmap_lock over syscall-based read(2)/write(2) IO. These IO paths can
* fault in pages during copy in/out (for buffered IO) or require the mmap_sem
* in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
* page faults already hold the mmap_sem.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
* taken in places where we need to invalidate the page cache in a race
* free manner (e.g. truncate, hole punch and other extent manipulation
* functions).
@@ -191,10 +191,13 @@ xfs_ilock(
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ down_write_nested(&VFS_I(ip)->i_rwsem,
+ XFS_IOLOCK_DEP(lock_flags));
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ down_read_nested(&VFS_I(ip)->i_rwsem,
+ XFS_IOLOCK_DEP(lock_flags));
+ }
if (lock_flags & XFS_MMAPLOCK_EXCL)
mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
if (lock_flags & XFS_IOLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_iolock))
+ if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
goto out;
} else if (lock_flags & XFS_IOLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_iolock))
+ if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
goto out;
}
@@ -271,9 +274,9 @@ out_undo_mmaplock:
mrunlock_shared(&ip->i_mmaplock);
out_undo_iolock:
if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
+ up_write(&VFS_I(ip)->i_rwsem);
else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
+ up_read(&VFS_I(ip)->i_rwsem);
out:
return 0;
}
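As a usage sketch of the ordering rules above: a single xfs_ilock() call with combined flags is assumed to acquire the lock classes in the documented order (i_rwsem, then i_mmaplock, then i_lock), so a caller needing full serialisation can write:

	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);
	/* ... invalidate page cache / manipulate extents race-free ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);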
@@ -310,9 +313,9 @@ xfs_iunlock(
ASSERT(lock_flags != 0);
if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
+ up_write(&VFS_I(ip)->i_rwsem);
else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
+ up_read(&VFS_I(ip)->i_rwsem);
if (lock_flags & XFS_MMAPLOCK_EXCL)
mrunlock_excl(&ip->i_mmaplock);
@@ -345,7 +348,7 @@ xfs_ilock_demote(
if (lock_flags & XFS_MMAPLOCK_EXCL)
mrdemote(&ip->i_mmaplock);
if (lock_flags & XFS_IOLOCK_EXCL)
- mrdemote(&ip->i_iolock);
+ downgrade_write(&VFS_I(ip)->i_rwsem);
trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
@@ -370,8 +373,9 @@ xfs_isilocked(
if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !!ip->i_iolock.mr_writer;
- return rwsem_is_locked(&ip->i_iolock.mr_lock);
+ return !debug_locks ||
+ lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
+ return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
}
ASSERT(0);
@@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
- ASSERT(xfs_lockdep_subclass_ok(subclass +
- XFS_IOLOCK_PARENT_VAL));
class += subclass << XFS_IOLOCK_SHIFT;
- if (lock_mode & XFS_IOLOCK_PARENT)
- class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
}
if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
@@ -477,8 +477,6 @@ xfs_lock_inodes(
XFS_ILOCK_EXCL));
ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
XFS_ILOCK_SHARED)));
- ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
- inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
@@ -581,10 +579,8 @@ xfs_lock_two_inodes(
int attempts = 0;
xfs_log_item_t *lp;
- if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
- ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
- ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
- } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+ ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
+ if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
ASSERT(ip0->i_ino != ip1->i_ino);
@@ -715,7 +711,6 @@ xfs_lookup(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
if (error)
goto out_unlock;
@@ -724,14 +719,12 @@ xfs_lookup(
if (error)
goto out_free_name;
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return 0;
out_free_name:
if (ci_name)
kmem_free(ci_name->name);
out_unlock:
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
*ipp = NULL;
return error;
}
@@ -1215,8 +1208,7 @@ xfs_create(
if (error)
goto out_release_inode;
- xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
- XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
xfs_defer_init(&dfops, &first_block);
@@ -1252,7 +1244,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1325,7 +1317,7 @@ xfs_create(
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
@@ -1466,11 +1458,10 @@ xfs_link(
if (error)
goto std_return;
- xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow hard link
@@ -2041,7 +2032,6 @@ xfs_iunlink(
agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
return 0;
@@ -2133,7 +2123,6 @@ xfs_iunlink_remove(
agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
} else {
@@ -2579,10 +2568,9 @@ xfs_remove(
goto std_return;
}
- xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
@@ -2963,12 +2951,6 @@ xfs_rename(
* whether the target directory is the same as the source
* directory, we can lock from 2 to 4 inodes.
*/
- if (!new_parent)
- xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
- else
- xfs_lock_two_inodes(src_dp, target_dp,
- XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
@@ -2976,9 +2958,9 @@ xfs_rename(
* we can rely on either trans_commit or trans_cancel to unlock
* them.
*/
- xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f14c1de2549d..10dcf27b4c85 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,7 +56,6 @@ typedef struct xfs_inode {
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
- mrlock_t i_iolock; /* inode IO lock */
mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
spinlock_t i_flags_lock; /* inode i_flags lock */
@@ -246,6 +245,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
* Synchronize processes attempting to flush the in-core inode back to disk.
*/
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+ return xfs_iflags_test(ip, XFS_IFLOCK);
+}
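A sketch of the flush lock lifecycle this helper checks; example_flush_inode() is hypothetical, and the success-path unlock at I/O completion follows the "xfs_iflush_abort() drops the flush lock" note earlier in this series:

	if (xfs_iflock_nowait(ip)) {
		if (example_flush_inode(ip))	/* returns errno on failure */
			xfs_ifunlock(ip);	/* error path unlocks here */
		/* on success, I/O completion (or iflush_abort) unlocks */
	}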
+
extern void __xfs_iflock(struct xfs_inode *ip);
static inline int xfs_iflock_nowait(struct xfs_inode *ip)
@@ -261,16 +265,12 @@ static inline void xfs_iflock(struct xfs_inode *ip)
static inline void xfs_ifunlock(struct xfs_inode *ip)
{
+ ASSERT(xfs_isiflocked(ip));
xfs_iflags_clear(ip, XFS_IFLOCK);
smp_mb();
wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
}
-static inline int xfs_isiflocked(struct xfs_inode *ip)
-{
- return xfs_iflags_test(ip, XFS_IFLOCK);
-}
-
/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
@@ -332,7 +332,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
* IOLOCK values
*
* 0-3 subclass value
- * 4-7 PARENT subclass values
+ * 4-7 unused
*
* MMAPLOCK values
*
@@ -347,10 +347,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
*
*/
#define XFS_IOLOCK_SHIFT 16
-#define XFS_IOLOCK_PARENT_VAL 4
-#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1)
+#define XFS_IOLOCK_MAX_SUBCLASS 3
#define XFS_IOLOCK_DEP_MASK 0x000f0000
-#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
#define XFS_MMAPLOCK_SHIFT 20
#define XFS_MMAPLOCK_NUMORDER 0
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9610e9c00952..d90e7811ccdd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork(
struct xfs_bmbt_rec *p;
ASSERT(ip->i_df.if_u1.if_extents != NULL);
- ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
+ ASSERT(xfs_iext_count(&ip->i_df) > 0);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
@@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork(
ip->i_afp->if_bytes > 0) {
struct xfs_bmbt_rec *p;
- ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
+ ASSERT(xfs_iext_count(ip->i_afp) ==
ip->i_d.di_anextents);
ASSERT(ip->i_afp->if_u1.if_extents != NULL);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c245bed3249b..fc563b82aea6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -639,7 +639,7 @@ xfs_ioc_space(
return error;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock, false);
+ error = xfs_break_layouts(inode, &iolock);
if (error)
goto out_unlock;
@@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr(
if (attr) {
if (ip->i_afp) {
if (ip->i_afp->if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = ip->i_afp->if_bytes /
- sizeof(xfs_bmbt_rec_t);
+ fa.fsx_nextents = xfs_iext_count(ip->i_afp);
else
fa.fsx_nextents = ip->i_d.di_anextents;
} else
fa.fsx_nextents = 0;
} else {
if (ip->i_df.if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = ip->i_df.if_bytes /
- sizeof(xfs_bmbt_rec_t);
+ fa.fsx_nextents = xfs_iext_count(&ip->i_df);
else
fa.fsx_nextents = ip->i_d.di_nextents;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 436e109bb01e..0d147428971e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -395,11 +395,12 @@ xfs_iomap_prealloc_size(
struct xfs_inode *ip,
loff_t offset,
loff_t count,
- xfs_extnum_t idx,
- struct xfs_bmbt_irec *prev)
+ xfs_extnum_t idx)
{
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ struct xfs_bmbt_irec prev;
int shift = 0;
int64_t freesp;
xfs_fsblock_t qblocks;
@@ -419,8 +420,8 @@ xfs_iomap_prealloc_size(
*/
if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
- idx == 0 ||
- prev->br_startoff + prev->br_blockcount < offset_fsb)
+ !xfs_iext_get_extent(ifp, idx - 1, &prev) ||
+ prev.br_startoff + prev.br_blockcount < offset_fsb)
return mp->m_writeio_blocks;
/*
@@ -439,8 +440,8 @@ xfs_iomap_prealloc_size(
* always extends to MAXEXTLEN rather than falling short due to things
* like stripe unit/width alignment of real extents.
*/
- if (prev->br_blockcount <= (MAXEXTLEN >> 1))
- alloc_blocks = prev->br_blockcount << 1;
+ if (prev.br_blockcount <= (MAXEXTLEN >> 1))
+ alloc_blocks = prev.br_blockcount << 1;
else
alloc_blocks = XFS_B_TO_FSB(mp, offset);
if (!alloc_blocks)
@@ -535,11 +536,11 @@ xfs_file_iomap_begin_delay(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- xfs_fileoff_t end_fsb, orig_end_fsb;
+ xfs_fileoff_t end_fsb;
int error = 0, eof = 0;
struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec prev;
xfs_extnum_t idx;
+ xfs_fsblock_t prealloc_blocks = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -563,8 +564,7 @@ xfs_file_iomap_begin_delay(
goto out_unlock;
}
- xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx,
- &got, &prev);
+ eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
if (!eof && got.br_startoff <= offset_fsb) {
if (xfs_is_reflink_inode(ip)) {
bool shared;
@@ -595,35 +595,32 @@ xfs_file_iomap_begin_delay(
* the lower level functions are updated.
*/
count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = orig_end_fsb =
- min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
if (eof) {
- xfs_fsblock_t prealloc_blocks;
-
- prealloc_blocks =
- xfs_iomap_prealloc_size(ip, offset, count, idx, &prev);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
+ xfs_fileoff_t p_end_fsb;
end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
- end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
- prealloc_blocks;
+ p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+ prealloc_blocks;
align = xfs_eof_alignment(ip, 0);
if (align)
- end_fsb = roundup_64(end_fsb, align);
+ p_end_fsb = roundup_64(p_end_fsb, align);
- end_fsb = min(end_fsb, maxbytes_fsb);
- ASSERT(end_fsb > offset_fsb);
+ p_end_fsb = min(p_end_fsb, maxbytes_fsb);
+ ASSERT(p_end_fsb > offset_fsb);
+ prealloc_blocks = p_end_fsb - end_fsb;
}
}
retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, &got,
- &prev, &idx, eof);
+ end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof);
switch (error) {
case 0:
break;
@@ -631,8 +628,8 @@ retry:
case -EDQUOT:
/* retry without any preallocation */
trace_xfs_delalloc_enospc(ip, offset, count);
- if (end_fsb != orig_end_fsb) {
- end_fsb = orig_end_fsb;
+ if (prealloc_blocks) {
+ prealloc_blocks = 0;
goto retry;
}
/*FALLTHRU*/
@@ -640,13 +637,6 @@ retry:
goto out_unlock;
}
- /*
- * Tag the inode as speculatively preallocated so we can reclaim this
- * space on demand, if necessary.
- */
- if (end_fsb != orig_end_fsb)
- xfs_inode_set_eofblocks_tag(ip);
-
trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
done:
if (isnullstartblock(got.br_startblock))
@@ -960,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode,
(IS_DAX(inode) && ISUNWRITTEN(imap));
}
+static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
+{
+ /*
+ * COW writes will allocate delalloc space, so we need to make sure
+ * to take the lock exclusively here.
+ */
+ if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
+ return true;
+ if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE))
+ return true;
+ return false;
+}
+
static int
xfs_file_iomap_begin(
struct inode *inode,
@@ -979,18 +982,14 @@ xfs_file_iomap_begin(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if ((flags & IOMAP_WRITE) && !IS_DAX(inode) &&
- !xfs_get_extsz_hint(ip)) {
+ if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+ !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
return xfs_file_iomap_begin_delay(inode, offset, length, flags,
iomap);
}
- /*
- * COW writes will allocate delalloc space, so we need to make sure
- * to take the lock exclusively here.
- */
- if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+ if (need_excl_ilock(ip, flags)) {
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, XFS_ILOCK_EXCL);
} else {
@@ -1003,17 +1002,41 @@ xfs_file_iomap_begin(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ if (xfs_is_reflink_inode(ip) &&
+ (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) {
+ shared = xfs_reflink_find_cow_mapping(ip, offset, &imap);
+ if (shared) {
+ xfs_iunlock(ip, lockmode);
+ goto alloc_done;
+ }
+ ASSERT(!isnullstartblock(imap.br_startblock));
+ }
+
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
if (error)
goto out_unlock;
- if (flags & IOMAP_REPORT) {
+ if ((flags & IOMAP_REPORT) ||
+ (xfs_is_reflink_inode(ip) &&
+ (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) {
/* Trim the mapping to the nearest shared extent boundary. */
error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
&trimmed);
if (error)
goto out_unlock;
+
+ /*
+ * We're here because we're trying to do a directio write to a
+ * region that isn't aligned to a filesystem block. If the
+ * extent is shared, fall back to buffered mode to handle the
+ * RMW.
+ */
+ if (!(flags & IOMAP_REPORT) && shared) {
+ trace_xfs_reflink_bounce_dio_write(ip, &imap);
+ error = -EREMCHG;
+ goto out_unlock;
+ }
}
if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
@@ -1048,6 +1071,7 @@ xfs_file_iomap_begin(
if (error)
return error;
+alloc_done:
iomap->flags = IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
} else {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 405a65cd9d6b..b930be0b1596 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -983,15 +983,13 @@ xfs_vn_setattr(
struct xfs_inode *ip = XFS_I(d_inode(dentry));
uint iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, iolock);
- error = xfs_break_layouts(d_inode(dentry), &iolock, true);
- if (!error) {
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- iolock |= XFS_MMAPLOCK_EXCL;
+ error = xfs_break_layouts(d_inode(dentry), &iolock);
+ if (error)
+ return error;
- error = xfs_vn_setattr_size(dentry, iattr);
- }
- xfs_iunlock(ip, iolock);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_vn_setattr_size(dentry, iattr);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
} else {
error = xfs_vn_setattr_nonsize(dentry, iattr);
}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 68640fb63a54..a415f822f2c1 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -78,6 +78,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/freezer.h>
#include <linux/list_sort.h>
#include <linux/ratelimit.h>
+#include <linux/rhashtable.h>
#include <asm/page.h>
#include <asm/div64.h>
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3b74fa011bb1..c39ac14ff540 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1668,7 +1668,7 @@ xlog_cksum(
__uint32_t crc;
/* first generate the crc for the record header ... */
- crc = xfs_start_cksum((char *)rhead,
+ crc = xfs_start_cksum_update((char *)rhead,
sizeof(struct xlog_rec_header),
offsetof(struct xlog_rec_header, h_crc));
@@ -1862,26 +1862,21 @@ xlog_sync(
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
- bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
+ bp->b_flags &= ~XBF_FLUSH;
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
- if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
- bp->b_flags |= XBF_FUA;
-
- /*
- * Flush the data device before flushing the log to make
- * sure all meta data written back from the AIL actually made
- * it to disk before stamping the new log tail LSN into the
- * log buffer. For an external log we need to issue the
- * flush explicitly, and unfortunately synchronously here;
- * for an internal log we can simply use the block layer
- * state machine for preflushes.
- */
- if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
- xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- else
- bp->b_flags |= XBF_FLUSH;
- }
+ /*
+	 * Flush the data device before flushing the log to make sure all
+	 * metadata written back from the AIL actually made it to disk before
+ * stamping the new log tail LSN into the log buffer. For an external
+ * log we need to issue the flush explicitly, and unfortunately
+ * synchronously here; for an internal log we can simply use the block
+ * layer state machine for preflushes.
+ */
+ if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+ xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
+ else
+ bp->b_flags |= XBF_FLUSH;
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1906,10 +1901,8 @@ xlog_sync(
xfs_buf_associate_memory(bp,
(char *)&iclog->ic_header + count, split);
bp->b_fspriv = iclog;
- bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
- if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
- bp->b_flags |= XBF_FUA;
+ bp->b_flags &= ~XBF_FLUSH;
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9b3d7c76915d..4a98762ec8b4 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled(
struct xlog *log,
xfs_daddr_t blkno,
uint len,
- ushort flags)
+ unsigned short flags)
{
struct list_head *bucket;
struct xfs_buf_cancel *bcp;
@@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled(
struct xlog *log,
xfs_daddr_t blkno,
uint len,
- ushort flags)
+ unsigned short flags)
{
struct xfs_buf_cancel *bcp;
@@ -5113,19 +5113,21 @@ xlog_recover_process(
struct list_head *buffer_list)
{
int error;
+ __le32 old_crc = rhead->h_crc;
__le32 crc;
+
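+	/*
+	 * Capture h_crc before xlog_cksum(): the checksum helper is
+	 * assumed to zero the CRC field in place while computing over
+	 * the header, so the on-disk value must be saved up front.
+	 */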
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
/*
* Nothing else to do if this is a CRC verification pass. Just return
 * if this is a record with a non-zero crc. Unfortunately, mkfs always
- * sets h_crc to 0 so we must consider this valid even on v5 supers.
+ * leaves h_crc zero (hence old_crc == 0), so this is valid even on v5 supers.
* Otherwise, return EFSBADCRC on failure so the callers up the stack
* know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
- if (rhead->h_crc && crc != rhead->h_crc)
+ if (old_crc && crc != old_crc)
return -EFSBADCRC;
return 0;
}
@@ -5136,11 +5138,11 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
- if (crc != rhead->h_crc) {
- if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ if (crc != old_crc) {
+ if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
- le32_to_cpu(rhead->h_crc),
+ le32_to_cpu(old_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b341f10cf481..9b9540db17a6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -157,6 +157,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_buf_hash_destroy(pag);
call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
@@ -212,8 +213,8 @@ xfs_initialize_perag(
spin_lock_init(&pag->pag_ici_lock);
mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
- spin_lock_init(&pag->pag_buf_lock);
- pag->pag_buf_tree = RB_ROOT;
+ if (xfs_buf_hash_init(pag))
+ goto out_unwind;
if (radix_tree_preload(GFP_NOFS))
goto out_unwind;
@@ -239,9 +240,11 @@ xfs_initialize_perag(
return 0;
out_unwind:
+ xfs_buf_hash_destroy(pag);
kmem_free(pag);
for (; index > first_initialised; index--) {
pag = radix_tree_delete(&mp->m_perag_tree, index);
+ xfs_buf_hash_destroy(pag);
kmem_free(pag);
}
return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 819b80b15bfb..84f785218907 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -393,8 +393,8 @@ typedef struct xfs_perag {
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
/* buffer cache index */
- spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
- struct rb_root pag_buf_tree; /* ordered tree of active buffers */
+ spinlock_t pag_buf_lock; /* lock for pag_buf_hash */
+ struct rhashtable pag_buf_hash;
/* for rcu-safe freeing */
struct rcu_head rcu_head;
@@ -424,6 +424,9 @@ xfs_perag_resv(
}
}
+int xfs_buf_hash_init(xfs_perag_t *pag);
+void xfs_buf_hash_destroy(xfs_perag_t *pag);
+
extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 93a7aafa56d6..2f2dc3c09ad0 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -32,8 +32,7 @@
int
xfs_break_layouts(
struct inode *inode,
- uint *iolock,
- bool with_imutex)
+ uint *iolock)
{
struct xfs_inode *ip = XFS_I(inode);
int error;
@@ -42,12 +41,8 @@ xfs_break_layouts(
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
- if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
- inode_unlock(inode);
error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL;
- if (with_imutex)
- inode_lock(inode);
xfs_ilock(ip, *iolock);
}
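Callers follow the pattern already visible in xfs_ioc_space() earlier in this diff: take the IO lock, then let xfs_break_layouts() drop and re-take it as needed:

	uint iolock = XFS_IOLOCK_EXCL;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock);
	if (error)
		goto out_unlock;	/* *iolock still names the held mode */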
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index e8339f74966b..b587cb99b2b7 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
struct iattr *iattr);
-int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
+int xfs_break_layouts(struct inode *inode, uint *iolock);
#else
static inline int
-xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
+xfs_break_layouts(struct inode *inode, uint *iolock)
{
return 0;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index a60d9e2739d1..45e50ea90769 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks(
return error;
}
rtblks = 0;
- nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+ nextents = xfs_iext_count(ifp);
for (idx = 0; idx < nextents; idx++)
rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
*O_rtblks = (xfs_qcnt_t)rtblks;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index a279b4e7f5fe..88fd03c66e99 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -243,12 +243,11 @@ xfs_reflink_reserve_cow(
struct xfs_bmbt_irec *imap,
bool *shared)
{
- struct xfs_bmbt_irec got, prev;
- xfs_fileoff_t end_fsb, orig_end_fsb;
- int eof = 0, error = 0;
- bool trimmed;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got;
+ int error = 0;
+ bool eof = false, trimmed;
xfs_extnum_t idx;
- xfs_extlen_t align;
/*
* Search the COW fork extent list first. This serves two purposes:
@@ -258,8 +257,9 @@ xfs_reflink_reserve_cow(
* extent list is generally faster than going out to the shared extent
* tree.
*/
- xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx,
- &got, &prev);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
+ eof = true;
if (!eof && got.br_startoff <= imap->br_startoff) {
trace_xfs_reflink_cow_found(ip, imap);
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
@@ -285,33 +285,12 @@ xfs_reflink_reserve_cow(
if (error)
return error;
- end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount;
-
- align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
- if (align)
- end_fsb = roundup_64(end_fsb, align);
-
-retry:
error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- end_fsb - imap->br_startoff, &got, &prev, &idx, eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
+ imap->br_blockcount, 0, &got, &idx, eof);
+ if (error == -ENOSPC || error == -EDQUOT)
trace_xfs_reflink_cow_enospc(ip, imap);
- if (end_fsb != orig_end_fsb) {
- end_fsb = orig_end_fsb;
- goto retry;
- }
- /*FALLTHRU*/
- default:
+ if (error)
return error;
- }
-
- if (end_fsb != orig_end_fsb)
- xfs_inode_set_cowblocks_tag(ip);
trace_xfs_reflink_cow_alloc(ip, &got);
return 0;
@@ -418,87 +397,65 @@ xfs_reflink_allocate_cow_range(
}
/*
- * Find the CoW reservation (and whether or not it needs block allocation)
- * for a given byte offset of a file.
+ * Find the CoW reservation for a given byte offset of a file.
*/
bool
xfs_reflink_find_cow_mapping(
struct xfs_inode *ip,
xfs_off_t offset,
- struct xfs_bmbt_irec *imap,
- bool *need_alloc)
+ struct xfs_bmbt_irec *imap)
{
- struct xfs_bmbt_irec irec;
- struct xfs_ifork *ifp;
- struct xfs_bmbt_rec_host *gotp;
- xfs_fileoff_t bno;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ xfs_fileoff_t offset_fsb;
+ struct xfs_bmbt_irec got;
xfs_extnum_t idx;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
ASSERT(xfs_is_reflink_inode(ip));
- /* Find the extent in the CoW fork. */
- ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- bno = XFS_B_TO_FSBT(ip->i_mount, offset);
- gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
- if (!gotp)
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
return false;
-
- xfs_bmbt_get_all(gotp, &irec);
- if (bno >= irec.br_startoff + irec.br_blockcount ||
- bno < irec.br_startoff)
+ if (got.br_startoff > offset_fsb)
return false;
trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
- &irec);
-
- /* If it's still delalloc, we must allocate later. */
- *imap = irec;
- *need_alloc = !!(isnullstartblock(irec.br_startblock));
-
+ &got);
+ *imap = got;
return true;
}
/*
* Trim an extent to end at the next CoW reservation past offset_fsb.
*/
-int
+void
xfs_reflink_trim_irec_to_next_cow(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
struct xfs_bmbt_irec *imap)
{
- struct xfs_bmbt_irec irec;
- struct xfs_ifork *ifp;
- struct xfs_bmbt_rec_host *gotp;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got;
xfs_extnum_t idx;
if (!xfs_is_reflink_inode(ip))
- return 0;
+ return;
/* Find the extent in the CoW fork. */
- ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
- if (!gotp)
- return 0;
- xfs_bmbt_get_all(gotp, &irec);
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
+ return;
/* This is the extent before; try sliding up one. */
- if (irec.br_startoff < offset_fsb) {
- idx++;
- if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- return 0;
- gotp = xfs_iext_get_ext(ifp, idx);
- xfs_bmbt_get_all(gotp, &irec);
+ if (got.br_startoff < offset_fsb) {
+ if (!xfs_iext_get_extent(ifp, idx + 1, &got))
+ return;
}
- if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
- return 0;
+ if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
+ return;
- imap->br_blockcount = irec.br_startoff - imap->br_startoff;
+ imap->br_blockcount = got.br_startoff - imap->br_startoff;
trace_xfs_reflink_trim_irec(ip, imap);
-
- return 0;
}
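The conversions in this file all rely on the same incore-extent iteration idiom; a sketch, assuming xfs_iext_lookup_extent() returns false when nothing lies at or beyond the offset and xfs_iext_get_extent() returns false past the last extent, as the call sites above use them:

	struct xfs_bmbt_irec	got;
	xfs_extnum_t		idx;

	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
		return;			/* hole at/after offset_fsb */
	do {
		/* ... process "got" ... */
	} while (xfs_iext_get_extent(ifp, ++idx, &got));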
/*
@@ -512,18 +469,15 @@ xfs_reflink_cancel_cow_blocks(
xfs_fileoff_t end_fsb)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got, prev, del;
+ struct xfs_bmbt_irec got, del;
xfs_extnum_t idx;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
- int error = 0, eof = 0;
+ int error = 0;
if (!xfs_is_reflink_inode(ip))
return 0;
-
- xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx,
- &got, &prev);
- if (eof)
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
return 0;
while (got.br_startoff < end_fsb) {
@@ -566,9 +520,8 @@ xfs_reflink_cancel_cow_blocks(
xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
}
- if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec))
+ if (!xfs_iext_get_extent(ifp, ++idx, &got))
break;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
}
/* clear tag if cow fork is emptied */
@@ -638,13 +591,13 @@ xfs_reflink_end_cow(
xfs_off_t count)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got, prev, del;
+ struct xfs_bmbt_irec got, del;
struct xfs_trans *tp;
xfs_fileoff_t offset_fsb;
xfs_fileoff_t end_fsb;
xfs_fsblock_t firstfsb;
struct xfs_defer_ops dfops;
- int error, eof = 0;
+ int error;
unsigned int resblks;
xfs_filblks_t rlen;
xfs_extnum_t idx;
@@ -668,13 +621,11 @@ xfs_reflink_end_cow(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx,
- &got, &prev);
-
 /* If there is a hole at end_fsb - 1, go to the previous extent */
- if (eof || got.br_startoff > end_fsb) {
+ if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
+ got.br_startoff > end_fsb) {
ASSERT(idx > 0);
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got);
+ xfs_iext_get_extent(ifp, --idx, &got);
}
/* Walk backwards until we're out of the I/O range... */
@@ -722,11 +673,9 @@ xfs_reflink_end_cow(
error = xfs_defer_finish(&tp, &dfops, ip);
if (error)
goto out_defer;
-
next_extent:
- if (idx < 0)
+ if (!xfs_iext_get_extent(ifp, idx, &got))
break;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got);
}
error = xfs_trans_commit(tp);
@@ -1302,13 +1251,11 @@ xfs_reflink_remap_range(
return -EIO;
/* Lock both files against IO */
- if (same_inode) {
- xfs_ilock(src, XFS_IOLOCK_EXCL);
+ lock_two_nondirectories(inode_in, inode_out);
+ if (same_inode)
xfs_ilock(src, XFS_MMAPLOCK_EXCL);
- } else {
- xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+ else
xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
- }
/* Don't touch certain kinds of inodes */
ret = -EPERM;
@@ -1345,8 +1292,14 @@ xfs_reflink_remap_range(
goto out_unlock;
}
- if (len == 0)
+ /* Zero length dedupe exits immediately; reflink goes to EOF. */
+ if (len == 0) {
+ if (is_dedupe) {
+ ret = 0;
+ goto out_unlock;
+ }
len = isize - pos_in;
+ }
/* Ensure offsets don't wrap and the input is inside i_size */
if (pos_in + len < pos_in || pos_out + len < pos_out ||
@@ -1447,11 +1400,9 @@ xfs_reflink_remap_range(
out_unlock:
xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(src, XFS_IOLOCK_EXCL);
- if (src->i_ino != dest->i_ino) {
+ if (!same_inode)
xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- xfs_iunlock(dest, XFS_IOLOCK_EXCL);
- }
+ unlock_two_nondirectories(inode_in, inode_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return ret;
@@ -1697,37 +1648,3 @@ out:
trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
return error;
}
-
-/*
- * Does this inode have any real CoW reservations?
- */
-bool
-xfs_reflink_has_real_cow_blocks(
- struct xfs_inode *ip)
-{
- struct xfs_bmbt_irec irec;
- struct xfs_ifork *ifp;
- struct xfs_bmbt_rec_host *gotp;
- xfs_extnum_t idx;
-
- if (!xfs_is_reflink_inode(ip))
- return false;
-
- /* Go find the old extent in the CoW fork. */
- ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
- while (gotp) {
- xfs_bmbt_get_all(gotp, &irec);
-
- if (!isnullstartblock(irec.br_startblock))
- return true;
-
- /* Roll on... */
- idx++;
- if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
- break;
- gotp = xfs_iext_get_ext(ifp, idx);
- }
-
- return false;
-}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index fad11607c9ad..aa6a4d64bd35 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -31,8 +31,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
xfs_off_t offset, xfs_off_t count);
extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
- struct xfs_bmbt_irec *imap, bool *need_alloc);
-extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap);
+extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
@@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
-extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip);
-
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 12d48cd8f8a4..f11282c96887 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
}
/* extra precision counters */
for_each_possible_cpu(i) {
- xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
- xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
- xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
}
len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
@@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
for_each_possible_cpu(c) {
preempt_disable();
/* save vn_active, it's a universal truth! */
- vn_active = per_cpu_ptr(stats, c)->vn_active;
+ vn_active = per_cpu_ptr(stats, c)->s.vn_active;
memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
- per_cpu_ptr(stats, c)->vn_active = vn_active;
+ per_cpu_ptr(stats, c)->s.vn_active = vn_active;
preempt_enable();
}
}
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 79ad2e69fc33..375840f5a99a 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -22,9 +22,37 @@
#include <linux/percpu.h>
/*
+ * The btree stats arrays have fixed offsets for the different stats. We
+ * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and
+ * that allows us to use fixed offsets into the stats array for each btree
+ * stat. These index offsets are defined in the order they will be emitted
+ * in the stats files, so it is possible to add new btree stat types by
+ * appending to the enum list below.
+ */
+enum {
+ __XBTS_lookup = 0,
+ __XBTS_compare = 1,
+ __XBTS_insrec = 2,
+ __XBTS_delrec = 3,
+ __XBTS_newroot = 4,
+ __XBTS_killroot = 5,
+ __XBTS_increment = 6,
+ __XBTS_decrement = 7,
+ __XBTS_lshift = 8,
+ __XBTS_rshift = 9,
+ __XBTS_split = 10,
+ __XBTS_join = 11,
+ __XBTS_alloc = 12,
+ __XBTS_free = 13,
+ __XBTS_moves = 14,
+
+ __XBTS_MAX = 15,
+};
+
+/*
* XFS global statistics
*/
-struct xfsstats {
+struct __xfsstats {
# define XFSSTAT_END_EXTENT_ALLOC 4
__uint32_t xs_allocx;
__uint32_t xs_allocb;
@@ -117,118 +145,20 @@ struct xfsstats {
__uint32_t xb_page_found;
__uint32_t xb_get_read;
/* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
- __uint32_t xs_abtb_2_lookup;
- __uint32_t xs_abtb_2_compare;
- __uint32_t xs_abtb_2_insrec;
- __uint32_t xs_abtb_2_delrec;
- __uint32_t xs_abtb_2_newroot;
- __uint32_t xs_abtb_2_killroot;
- __uint32_t xs_abtb_2_increment;
- __uint32_t xs_abtb_2_decrement;
- __uint32_t xs_abtb_2_lshift;
- __uint32_t xs_abtb_2_rshift;
- __uint32_t xs_abtb_2_split;
- __uint32_t xs_abtb_2_join;
- __uint32_t xs_abtb_2_alloc;
- __uint32_t xs_abtb_2_free;
- __uint32_t xs_abtb_2_moves;
-#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
- __uint32_t xs_abtc_2_lookup;
- __uint32_t xs_abtc_2_compare;
- __uint32_t xs_abtc_2_insrec;
- __uint32_t xs_abtc_2_delrec;
- __uint32_t xs_abtc_2_newroot;
- __uint32_t xs_abtc_2_killroot;
- __uint32_t xs_abtc_2_increment;
- __uint32_t xs_abtc_2_decrement;
- __uint32_t xs_abtc_2_lshift;
- __uint32_t xs_abtc_2_rshift;
- __uint32_t xs_abtc_2_split;
- __uint32_t xs_abtc_2_join;
- __uint32_t xs_abtc_2_alloc;
- __uint32_t xs_abtc_2_free;
- __uint32_t xs_abtc_2_moves;
-#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
- __uint32_t xs_bmbt_2_lookup;
- __uint32_t xs_bmbt_2_compare;
- __uint32_t xs_bmbt_2_insrec;
- __uint32_t xs_bmbt_2_delrec;
- __uint32_t xs_bmbt_2_newroot;
- __uint32_t xs_bmbt_2_killroot;
- __uint32_t xs_bmbt_2_increment;
- __uint32_t xs_bmbt_2_decrement;
- __uint32_t xs_bmbt_2_lshift;
- __uint32_t xs_bmbt_2_rshift;
- __uint32_t xs_bmbt_2_split;
- __uint32_t xs_bmbt_2_join;
- __uint32_t xs_bmbt_2_alloc;
- __uint32_t xs_bmbt_2_free;
- __uint32_t xs_bmbt_2_moves;
-#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
- __uint32_t xs_ibt_2_lookup;
- __uint32_t xs_ibt_2_compare;
- __uint32_t xs_ibt_2_insrec;
- __uint32_t xs_ibt_2_delrec;
- __uint32_t xs_ibt_2_newroot;
- __uint32_t xs_ibt_2_killroot;
- __uint32_t xs_ibt_2_increment;
- __uint32_t xs_ibt_2_decrement;
- __uint32_t xs_ibt_2_lshift;
- __uint32_t xs_ibt_2_rshift;
- __uint32_t xs_ibt_2_split;
- __uint32_t xs_ibt_2_join;
- __uint32_t xs_ibt_2_alloc;
- __uint32_t xs_ibt_2_free;
- __uint32_t xs_ibt_2_moves;
-#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15)
- __uint32_t xs_fibt_2_lookup;
- __uint32_t xs_fibt_2_compare;
- __uint32_t xs_fibt_2_insrec;
- __uint32_t xs_fibt_2_delrec;
- __uint32_t xs_fibt_2_newroot;
- __uint32_t xs_fibt_2_killroot;
- __uint32_t xs_fibt_2_increment;
- __uint32_t xs_fibt_2_decrement;
- __uint32_t xs_fibt_2_lshift;
- __uint32_t xs_fibt_2_rshift;
- __uint32_t xs_fibt_2_split;
- __uint32_t xs_fibt_2_join;
- __uint32_t xs_fibt_2_alloc;
- __uint32_t xs_fibt_2_free;
- __uint32_t xs_fibt_2_moves;
-#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15)
- __uint32_t xs_rmap_2_lookup;
- __uint32_t xs_rmap_2_compare;
- __uint32_t xs_rmap_2_insrec;
- __uint32_t xs_rmap_2_delrec;
- __uint32_t xs_rmap_2_newroot;
- __uint32_t xs_rmap_2_killroot;
- __uint32_t xs_rmap_2_increment;
- __uint32_t xs_rmap_2_decrement;
- __uint32_t xs_rmap_2_lshift;
- __uint32_t xs_rmap_2_rshift;
- __uint32_t xs_rmap_2_split;
- __uint32_t xs_rmap_2_join;
- __uint32_t xs_rmap_2_alloc;
- __uint32_t xs_rmap_2_free;
- __uint32_t xs_rmap_2_moves;
-#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15)
- __uint32_t xs_refcbt_2_lookup;
- __uint32_t xs_refcbt_2_compare;
- __uint32_t xs_refcbt_2_insrec;
- __uint32_t xs_refcbt_2_delrec;
- __uint32_t xs_refcbt_2_newroot;
- __uint32_t xs_refcbt_2_killroot;
- __uint32_t xs_refcbt_2_increment;
- __uint32_t xs_refcbt_2_decrement;
- __uint32_t xs_refcbt_2_lshift;
- __uint32_t xs_refcbt_2_rshift;
- __uint32_t xs_refcbt_2_split;
- __uint32_t xs_refcbt_2_join;
- __uint32_t xs_refcbt_2_alloc;
- __uint32_t xs_refcbt_2_free;
- __uint32_t xs_refcbt_2_moves;
+#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX)
+ __uint32_t xs_abtb_2[__XBTS_MAX];
+#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
+ __uint32_t xs_abtc_2[__XBTS_MAX];
+#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
+ __uint32_t xs_bmbt_2[__XBTS_MAX];
+#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
+ __uint32_t xs_ibt_2[__XBTS_MAX];
+#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX)
+ __uint32_t xs_fibt_2[__XBTS_MAX];
+#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
+ __uint32_t xs_rmap_2[__XBTS_MAX];
+#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
+ __uint32_t xs_refcbt_2[__XBTS_MAX];
#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6)
__uint32_t xs_qm_dqreclaims;
__uint32_t xs_qm_dqreclaim_misses;
@@ -245,26 +175,58 @@ struct xfsstats {
__uint64_t xs_read_bytes;
};
+struct xfsstats {
+ union {
+ struct __xfsstats s;
+ uint32_t a[XFSSTAT_END_XQMSTAT];
+ };
+};
+
+/*
+ * simple wrapper for getting the array index of a struct member offset
+ */
+#define XFS_STATS_CALC_INDEX(member) \
+ (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t))
+
+
int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
void xfs_stats_clearall(struct xfsstats __percpu *stats);
extern struct xstats xfsstats;
#define XFS_STATS_INC(mp, v) \
do { \
- per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \
- per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \
} while (0)
#define XFS_STATS_DEC(mp, v) \
do { \
- per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \
- per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \
} while (0)
#define XFS_STATS_ADD(mp, v, inc) \
do { \
- per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \
- per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \
+} while (0)
+
+#define XFS_STATS_INC_OFF(mp, off) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \
+} while (0)
+
+#define XFS_STATS_DEC_OFF(mp, off) \
+do { \
	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]--; \
	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]--; \
+} while (0)
+
+#define XFS_STATS_ADD_OFF(mp, off, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \
} while (0)
#if defined(CONFIG_PROC_FS)
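
To see how the new index scheme is meant to be consumed (a sketch only;
the btree code that drives these macros is outside this diff), a caller
computes the base array index of a btree's counter block once with
XFS_STATS_CALC_INDEX() and then bumps counters at the fixed __XBTS_*
offsets:

	/* hypothetical caller; in practice the base index would be
	 * cached in the btree cursor when the cursor is initialised */
	int	statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);

	XFS_STATS_INC_OFF(mp, statoff + __XBTS_lookup);
	XFS_STATS_ADD_OFF(mp, statoff + __XBTS_moves, nmoved);

Because struct xfsstats is a union of struct __xfsstats and the
uint32_t array 'a', ->a[statoff + __XBTS_lookup] aliases
->s.xs_abtb_2[__XBTS_lookup]: the array-indexed _OFF macros and the
named-member macros update the same per-cpu storage.
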
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ade4691e3f74..eecbaac08eba 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -104,9 +104,6 @@ static const match_table_t tokens = {
{Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
{Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
{Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
- {Opt_barrier, "barrier"}, /* use writer barriers for log write and
- * unwritten extent conversion */
- {Opt_nobarrier, "nobarrier"}, /* .. disable */
{Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
{Opt_inode32, "inode32"}, /* inode allocation limited to
* XFS_MAXINUMBER_32 */
@@ -134,6 +131,12 @@ static const match_table_t tokens = {
{Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
{Opt_dax, "dax"}, /* Enable direct access to bdev pages */
+
+ /* Deprecated mount options scheduled for removal */
+ {Opt_barrier, "barrier"}, /* use writer barriers for log write and
+ * unwritten extent conversion */
+ {Opt_nobarrier, "nobarrier"}, /* .. disable */
+
{Opt_err, NULL},
};
@@ -301,12 +304,6 @@ xfs_parseargs(
case Opt_nouuid:
mp->m_flags |= XFS_MOUNT_NOUUID;
break;
- case Opt_barrier:
- mp->m_flags |= XFS_MOUNT_BARRIER;
- break;
- case Opt_nobarrier:
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
- break;
case Opt_ikeep:
mp->m_flags |= XFS_MOUNT_IKEEP;
break;
@@ -374,6 +371,14 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DAX;
break;
#endif
+ case Opt_barrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+ break;
+ case Opt_nobarrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+ mp->m_flags &= ~XFS_MOUNT_BARRIER;
+ break;
default:
xfs_warn(mp, "unknown mount option [%s].", p);
return -EINVAL;
@@ -943,7 +948,7 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
XFS_STATS_INC(ip->i_mount, vn_rele);
XFS_STATS_INC(ip->i_mount, vn_remove);
@@ -1238,9 +1243,11 @@ xfs_fs_remount(
token = match_token(p, tokens, args);
switch (token) {
case Opt_barrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
mp->m_flags |= XFS_MOUNT_BARRIER;
break;
case Opt_nobarrier:
+ xfs_warn(mp, "%s option is deprecated, ignoring.", p);
mp->m_flags &= ~XFS_MOUNT_BARRIER;
break;
case Opt_inode64:
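
With the two hunks above, "barrier"/"nobarrier" still parse on both
mount and remount (and still toggle XFS_MOUNT_BARRIER for now), but each
use now logs a deprecation warning. Given xfs_warn()'s usual prefix, the
message should look roughly like the following (device name
illustrative):

	XFS (sda1): barrier option is deprecated, ignoring.
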
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 58142aeeeea6..f2cb45ed1d54 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -238,8 +238,7 @@ xfs_symlink(
if (error)
goto out_release_inode;
- xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
- XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
/*
@@ -287,7 +286,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
/*
@@ -412,7 +411,7 @@ out_release_inode:
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
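
One subtlety behind the comment above: the lock flags passed to
xfs_trans_ijoin() tell the transaction to release the ILOCK when it
commits or cancels, which is why unlock_dp_on_error is cleared right
after the join. Condensed from the patched flow (sketch, not a
compilable unit):

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;
	/* ... set up transaction, allocate the symlink inode ... */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;	/* transaction owns the unlock now */
	/* ... */
	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
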
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0907752be62d..69c5bcd9a51b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
DEFINE_BUF_EVENT(xfs_buf_iodone);
DEFINE_BUF_EVENT(xfs_buf_submit);
DEFINE_BUF_EVENT(xfs_buf_submit_wait);
-DEFINE_BUF_EVENT(xfs_buf_bawrite);
DEFINE_BUF_EVENT(xfs_buf_lock);
DEFINE_BUF_EVENT(xfs_buf_lock_done);
DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
@@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
-DEFINE_BUF_EVENT(xfs_bdstrat_shut);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
-DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
/* not really buffer traces, but the buf provides useful information */
DEFINE_BUF_EVENT(xfs_btree_corrupt);
-DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
DEFINE_BUF_EVENT(xfs_reset_dqcounts);
-DEFINE_BUF_EVENT(xfs_inode_item_push);
/* pass flags explicitly */
DECLARE_EVENT_CLASS(xfs_buf_flags_class,
@@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
DECLARE_EVENT_CLASS(xfs_filestream_class,
TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
@@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_evict_inode);
DEFINE_INODE_EVENT(xfs_update_time);
DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post,
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
TP_ARGS(ip, caller_ip))
-DEFINE_IREF_EVENT(xfs_ihold);
DEFINE_IREF_EVENT(xfs_irele);
DEFINE_IREF_EVENT(xfs_inode_pin);
DEFINE_IREF_EVENT(xfs_inode_unpin);
@@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss);
DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
DEFINE_DQUOT_EVENT(xfs_dqget_dup);
DEFINE_DQUOT_EVENT(xfs_dqput);
-DEFINE_DQUOT_EVENT(xfs_dqput_wait);
DEFINE_DQUOT_EVENT(xfs_dqput_free);
DEFINE_DQUOT_EVENT(xfs_dqrele);
DEFINE_DQUOT_EVENT(xfs_dqflush);
@@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
DEFINE_ATTR_EVENT(xfs_attr_sf_create);
DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
-DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
@@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
DEFINE_ATTR_EVENT(xfs_attr_node_addname);
DEFINE_ATTR_EVENT(xfs_attr_node_get);
-DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
DEFINE_ATTR_EVENT(xfs_attr_node_replace);
DEFINE_ATTR_EVENT(xfs_attr_node_removename);
@@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done);
DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
@@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name, \
struct xfs_inode *dest, xfs_off_t doffset), \
TP_ARGS(src, soffset, len, dest, doffset))
-/* two-file vfs io tracepoint class */
-DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
- TP_PROTO(struct inode *src, u64 soffset, u64 len,
- struct inode *dest, u64 doffset),
- TP_ARGS(src, soffset, len, dest, doffset),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(unsigned long, src_ino)
- __field(loff_t, src_isize)
- __field(loff_t, src_offset)
- __field(size_t, len)
- __field(unsigned long, dest_ino)
- __field(loff_t, dest_isize)
- __field(loff_t, dest_offset)
- ),
- TP_fast_assign(
- __entry->dev = src->i_sb->s_dev;
- __entry->src_ino = src->i_ino;
- __entry->src_isize = i_size_read(src);
- __entry->src_offset = soffset;
- __entry->len = len;
- __entry->dest_ino = dest->i_ino;
- __entry->dest_isize = i_size_read(dest);
- __entry->dest_offset = doffset;
- ),
- TP_printk("dev %d:%d count %zd "
- "ino 0x%lx isize 0x%llx offset 0x%llx -> "
- "ino 0x%lx isize 0x%llx offset 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->len,
- __entry->src_ino,
- __entry->src_isize,
- __entry->src_offset,
- __entry->dest_ino,
- __entry->dest_isize,
- __entry->dest_offset)
-)
-
-#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \
-DEFINE_EVENT(xfs_double_vfs_io_class, name, \
- TP_PROTO(struct inode *src, u64 soffset, u64 len, \
- struct inode *dest, u64 doffset), \
- TP_ARGS(src, soffset, len, dest, doffset))
-
-/* CoW write tracepoint */
-DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
- xfs_extlen_t len, xfs_fsblock_t new_pblk),
- TP_ARGS(ip, lblk, pblk, len, new_pblk),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_fsblock_t, pblk)
- __field(xfs_extlen_t, len)
- __field(xfs_fsblock_t, new_pblk)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->pblk = pblk;
- __entry->len = len;
- __entry->new_pblk = new_pblk;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
- "len 0x%x new_pblk %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->pblk,
- __entry->len,
- __entry->new_pblk)
-)
-
-#define DEFINE_COW_EVENT(name) \
-DEFINE_EVENT(xfs_copy_on_write_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
- xfs_extlen_t len, xfs_fsblock_t new_pblk), \
- TP_ARGS(ip, lblk, pblk, len, new_pblk))
-
/* inode/irec events */
DECLARE_EVENT_CLASS(xfs_inode_irec_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
@@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
@@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
/* ioctl tracepoints */
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
TRACE_EVENT(xfs_ioctl_clone,
TP_PROTO(struct inode *src, struct inode *dest),
TP_ARGS(src, dest),
@@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone,
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
-DEFINE_PAGE_EVENT(xfs_reflink_unshare_page);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
@@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
-DEFINE_COW_EVENT(xfs_reflink_fork_buf);
-DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error);
-DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error);
/* rmap swapext tracepoints */
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 62900938f26d..0594db435972 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
NULL
};
-static int
+static void
__xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
char *prefix,
@@ -148,7 +148,7 @@ __xfs_xattr_put_listent(
if (arraytop > context->firstu) {
context->count = -1; /* insufficient space */
context->seen_enough = 1;
- return 0;
+ return;
}
offset = (char *)context->alist + context->count;
strncpy(offset, prefix, prefix_len);
@@ -159,10 +159,10 @@ __xfs_xattr_put_listent(
compute_size:
context->count += prefix_len + namelen + 1;
- return 0;
+ return;
}
-static int
+static void
xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
int flags,
@@ -180,23 +180,19 @@ xfs_xattr_put_listent(
if (namelen == SGI_ACL_FILE_SIZE &&
strncmp(name, SGI_ACL_FILE,
SGI_ACL_FILE_SIZE) == 0) {
- int ret = __xfs_xattr_put_listent(
+ __xfs_xattr_put_listent(
context, XATTR_SYSTEM_PREFIX,
XATTR_SYSTEM_PREFIX_LEN,
XATTR_POSIX_ACL_ACCESS,
strlen(XATTR_POSIX_ACL_ACCESS));
- if (ret)
- return ret;
} else if (namelen == SGI_ACL_DEFAULT_SIZE &&
strncmp(name, SGI_ACL_DEFAULT,
SGI_ACL_DEFAULT_SIZE) == 0) {
- int ret = __xfs_xattr_put_listent(
+ __xfs_xattr_put_listent(
context, XATTR_SYSTEM_PREFIX,
XATTR_SYSTEM_PREFIX_LEN,
XATTR_POSIX_ACL_DEFAULT,
strlen(XATTR_POSIX_ACL_DEFAULT));
- if (ret)
- return ret;
}
#endif
@@ -205,7 +201,7 @@ xfs_xattr_put_listent(
* see them.
*/
if (!capable(CAP_SYS_ADMIN))
- return 0;
+ return;
prefix = XATTR_TRUSTED_PREFIX;
prefix_len = XATTR_TRUSTED_PREFIX_LEN;
@@ -217,8 +213,9 @@ xfs_xattr_put_listent(
prefix_len = XATTR_USER_PREFIX_LEN;
}
- return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
- namelen);
+ __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+ namelen);
+ return;
}
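
Since the listent callbacks can no longer return an error, their callers
must poll context->seen_enough to stop the attribute walk once the
destination buffer is full. A hypothetical caller loop under that
assumption (the real consumers live in xfs_attr_list.c and are not part
of this diff):

	context->put_listent(context, flags, name, namelen, valuelen);
	if (context->seen_enough)
		break;	/* destination buffer exhausted */
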
ssize_t