summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig15
-rw-r--r--fs/adfs/inode.c1
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/cmservice.c5
-rw-r--r--fs/afs/dir.c4
-rw-r--r--fs/afs/fsclient.c4
-rw-r--r--fs/afs/main.c4
-rw-r--r--fs/afs/vlclient.c1
-rw-r--r--fs/afs/write.c26
-rw-r--r--fs/bfs/file.c1
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf.c10
-rw-r--r--fs/binfmt_elf_fdpic.c15
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/block_dev.c268
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/block-group.c39
-rw-r--r--fs/btrfs/compression.c82
-rw-r--r--fs/btrfs/compression.h26
-rw-r--r--fs/btrfs/ctree.c5
-rw-r--r--fs/btrfs/ctree.h120
-rw-r--r--fs/btrfs/delalloc-space.c2
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-ref.c26
-rw-r--r--fs/btrfs/dev-replace.c2
-rw-r--r--fs/btrfs/discard.c2
-rw-r--r--fs/btrfs/disk-io.c81
-rw-r--r--fs/btrfs/extent-tree.c20
-rw-r--r--fs/btrfs/extent_io.c975
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/file-item.c110
-rw-r--r--fs/btrfs/file.c48
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c538
-rw-r--r--fs/btrfs/ioctl.c184
-rw-r--r--fs/btrfs/locking.c4
-rw-r--r--fs/btrfs/ordered-data.c253
-rw-r--r--fs/btrfs/ordered-data.h10
-rw-r--r--fs/btrfs/props.c16
-rw-r--r--fs/btrfs/qgroup.c10
-rw-r--r--fs/btrfs/reflink.c47
-rw-r--r--fs/btrfs/relocation.c75
-rw-r--r--fs/btrfs/scrub.c159
-rw-r--r--fs/btrfs/send.c47
-rw-r--r--fs/btrfs/space-info.c233
-rw-r--r--fs/btrfs/space-info.h30
-rw-r--r--fs/btrfs/subpage.c155
-rw-r--r--fs/btrfs/subpage.h33
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/sysfs.c74
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c61
-rw-r--r--fs/btrfs/transaction.h6
-rw-r--r--fs/btrfs/tree-log.c61
-rw-r--r--fs/btrfs/volumes.c26
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/zoned.c70
-rw-r--r--fs/btrfs/zoned.h14
-rw-r--r--fs/buffer.c25
-rw-r--r--fs/ceph/dir.c22
-rw-r--r--fs/ceph/file.c17
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/cifs/Kconfig6
-rw-r--r--fs/cifs/Makefile8
-rw-r--r--fs/cifs/asn1.c623
-rw-r--r--fs/cifs/cache.c14
-rw-r--r--fs/cifs/cifs_debug.c24
-rw-r--r--fs/cifs/cifs_debug.h2
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifs_fs_sb.h18
-rw-r--r--fs/cifs/cifs_ioctl.h36
-rw-r--r--fs/cifs/cifs_spnego.c14
-rw-r--r--fs/cifs/cifs_spnego.h14
-rw-r--r--fs/cifs/cifs_spnego_negtokeninit.asn140
-rw-r--r--fs/cifs/cifs_swn.c10
-rw-r--r--fs/cifs/cifsacl.c18
-rw-r--r--fs/cifs/cifsacl.h15
-rw-r--r--fs/cifs/cifsencrypt.c14
-rw-r--r--fs/cifs/cifsfs.c16
-rw-r--r--fs/cifs/cifsfs.h14
-rw-r--r--fs/cifs/cifsglob.h24
-rw-r--r--fs/cifs/cifspdu.h17
-rw-r--r--fs/cifs/cifsproto.h14
-rw-r--r--fs/cifs/cifssmb.c16
-rw-r--r--fs/cifs/connect.c159
-rw-r--r--fs/cifs/dfs_cache.c1079
-rw-r--r--fs/cifs/dfs_cache.h45
-rw-r--r--fs/cifs/dir.c27
-rw-r--r--fs/cifs/dns_resolve.c14
-rw-r--r--fs/cifs/dns_resolve.h14
-rw-r--r--fs/cifs/export.c14
-rw-r--r--fs/cifs/file.c60
-rw-r--r--fs/cifs/fs_context.c2
-rw-r--r--fs/cifs/fscache.c14
-rw-r--r--fs/cifs/fscache.h14
-rw-r--r--fs/cifs/inode.c19
-rw-r--r--fs/cifs/ioctl.c157
-rw-r--r--fs/cifs/link.c14
-rw-r--r--fs/cifs/misc.c37
-rw-r--r--fs/cifs/netlink.c2
-rw-r--r--fs/cifs/ntlmssp.h14
-rw-r--r--fs/cifs/readdir.c16
-rw-r--r--fs/cifs/rfc1002pdu.h14
-rw-r--r--fs/cifs/sess.c16
-rw-r--r--fs/cifs/smb2file.c14
-rw-r--r--fs/cifs/smb2glob.h11
-rw-r--r--fs/cifs/smb2inode.c14
-rw-r--r--fs/cifs/smb2maperror.c14
-rw-r--r--fs/cifs/smb2misc.c52
-rw-r--r--fs/cifs/smb2ops.c144
-rw-r--r--fs/cifs/smb2pdu.c62
-rw-r--r--fs/cifs/smb2pdu.h53
-rw-r--r--fs/cifs/smb2proto.h16
-rw-r--r--fs/cifs/smb2status.h14
-rw-r--r--fs/cifs/smb2transport.c26
-rw-r--r--fs/cifs/smbdirect.c14
-rw-r--r--fs/cifs/smberr.h14
-rw-r--r--fs/cifs/smbfsctl.h14
-rw-r--r--fs/cifs/trace.h29
-rw-r--r--fs/cifs/transport.c14
-rw-r--r--fs/cifs/xattr.c14
-rw-r--r--fs/configfs/file.c181
-rw-r--r--fs/configfs/inode.c8
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/crypto/fname.c10
-rw-r--r--fs/crypto/keysetup.c40
-rw-r--r--fs/dax.c3
-rw-r--r--fs/debugfs/file.c2
-rw-r--r--fs/debugfs/inode.c9
-rw-r--r--fs/dlm/config.c18
-rw-r--r--fs/dlm/config.h5
-rw-r--r--fs/dlm/debug_fs.c54
-rw-r--r--fs/dlm/dlm_internal.h42
-rw-r--r--fs/dlm/lock.c16
-rw-r--r--fs/dlm/lockspace.c14
-rw-r--r--fs/dlm/lowcomms.c411
-rw-r--r--fs/dlm/lowcomms.h25
-rw-r--r--fs/dlm/member.c37
-rw-r--r--fs/dlm/midcomms.c1343
-rw-r--r--fs/dlm/midcomms.h15
-rw-r--r--fs/dlm/rcom.c123
-rw-r--r--fs/dlm/util.c10
-rw-r--r--fs/dlm/util.h2
-rw-r--r--fs/ecryptfs/mmap.c13
-rw-r--r--fs/erofs/Kconfig1
-rw-r--r--fs/erofs/compress.h2
-rw-r--r--fs/erofs/data.c2
-rw-r--r--fs/erofs/decompressor.c2
-rw-r--r--fs/erofs/dir.c2
-rw-r--r--fs/erofs/erofs_fs.h2
-rw-r--r--fs/erofs/inode.c2
-rw-r--r--fs/erofs/internal.h2
-rw-r--r--fs/erofs/namei.c2
-rw-r--r--fs/erofs/super.c3
-rw-r--r--fs/erofs/tagptr.h3
-rw-r--r--fs/erofs/utils.c2
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/xattr.h1
-rw-r--r--fs/erofs/zdata.c6
-rw-r--r--fs/erofs/zdata.h1
-rw-r--r--fs/erofs/zmap.c2
-rw-r--r--fs/erofs/zpvec.h7
-rw-r--r--fs/exec.c9
-rw-r--r--fs/exfat/inode.c1
-rw-r--r--fs/ext2/inode.c4
-rw-r--r--fs/ext4/ext4.h18
-rw-r--r--fs/ext4/extents.c47
-rw-r--r--fs/ext4/extents_status.c4
-rw-r--r--fs/ext4/fast_commit.c170
-rw-r--r--fs/ext4/fast_commit.h19
-rw-r--r--fs/ext4/fsmap.h4
-rw-r--r--fs/ext4/ialloc.c17
-rw-r--r--fs/ext4/inline.c11
-rw-r--r--fs/ext4/inode.c10
-rw-r--r--fs/ext4/ioctl.c80
-rw-r--r--fs/ext4/mballoc.c2
-rw-r--r--fs/ext4/mmp.c28
-rw-r--r--fs/ext4/namei.c8
-rw-r--r--fs/ext4/resize.c9
-rw-r--r--fs/ext4/super.c68
-rw-r--r--fs/ext4/sysfs.c9
-rw-r--r--fs/ext4/xattr.c26
-rw-r--r--fs/ext4/xattr.h6
-rw-r--r--fs/fat/inode.c1
-rw-r--r--fs/fs-writeback.c366
-rw-r--r--fs/fuse/dax.c3
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c62
-rw-r--r--fs/gfs2/bmap.h2
-rw-r--r--fs/gfs2/dir.c2
-rw-r--r--fs/gfs2/file.c84
-rw-r--r--fs/gfs2/glock.c31
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/log.c6
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c7
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/gfs2/ops_fstype.c1
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/util.c1
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/inode.c7
-rw-r--r--fs/hfsplus/xattr.c1
-rw-r--r--fs/hpfs/file.c1
-rw-r--r--fs/hugetlbfs/inode.c19
-rw-r--r--fs/io-wq.c128
-rw-r--r--fs/io-wq.h5
-rw-r--r--fs/io_uring.c1380
-rw-r--r--fs/iomap/buffered-io.c27
-rw-r--r--fs/isofs/dir.c2
-rw-r--r--fs/jbd2/checkpoint.c206
-rw-r--r--fs/jbd2/journal.c230
-rw-r--r--fs/jbd2/transaction.c17
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/kernfs/inode.c8
-rw-r--r--fs/libfs.c44
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/namespace.c9
-rw-r--r--fs/netfs/Kconfig2
-rw-r--r--fs/netfs/read_helper.c51
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4client.c2
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c41
-rw-r--r--fs/nfs/nfstrace.h4
-rw-r--r--fs/nfs/pagelist.c20
-rw-r--r--fs/nfs/pnfs.c17
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nilfs2/btree.c1
-rw-r--r--fs/nilfs2/mdt.c1
-rw-r--r--fs/nilfs2/sysfs.c20
-rw-r--r--fs/notify/fanotify/fanotify_user.c34
-rw-r--r--fs/notify/fdinfo.c2
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c7
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/file.c55
-rw-r--r--fs/ocfs2/filecheck.c6
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/stackglue.c8
-rw-r--r--fs/omfs/file.c1
-rw-r--r--fs/open.c27
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c19
-rw-r--r--fs/proc/fd.c20
-rw-r--r--fs/proc/kcore.c67
-rw-r--r--fs/proc/loadavg.c2
-rw-r--r--fs/proc/stat.c4
-rw-r--r--fs/proc/task_mmu.c36
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/blk.c403
-rw-r--r--fs/quota/quota.c28
-rw-r--r--fs/quota/quota_tree.c33
-rw-r--r--fs/ramfs/inode.c9
-rw-r--r--fs/reiserfs/inode.c4
-rw-r--r--fs/reiserfs/journal.c14
-rw-r--r--fs/reiserfs/namei.c1
-rw-r--r--fs/seq_file.c43
-rw-r--r--fs/signalfd.c23
-rw-r--r--fs/squashfs/block.c5
-rw-r--r--fs/squashfs/squashfs_fs_sb.h1
-rw-r--r--fs/squashfs/super.c86
-rw-r--r--fs/super.c8
-rw-r--r--fs/sysv/itree.c1
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/inode.c1
-rw-r--r--fs/udf/namei.c4
-rw-r--r--fs/ufs/inode.c1
-rw-r--r--fs/userfaultfd.c19
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c22
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c12
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_fs.h4
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c46
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c17
-rw-r--r--fs/xfs/scrub/agheader.c1
-rw-r--r--fs/xfs/scrub/bmap.c2
-rw-r--r--fs/xfs/scrub/btree.c2
-rw-r--r--fs/xfs/scrub/common.c10
-rw-r--r--fs/xfs/scrub/dabtree.c2
-rw-r--r--fs/xfs/scrub/repair.c2
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_bmap_util.c100
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/xfs/xfs_inode.c31
-rw-r--r--fs/xfs/xfs_ioctl.c105
-rw-r--r--fs/xfs/xfs_iomap.c2
-rw-r--r--fs/xfs/xfs_log.c1
-rw-r--r--fs/xfs/xfs_message.h15
-rw-r--r--fs/xfs/xfs_trans_buf.c2
-rw-r--r--fs/zonefs/super.c4
303 files changed, 8526 insertions, 6059 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 141a856c50e7..a7749c126b8e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -240,6 +240,21 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+config HUGETLB_PAGE_FREE_VMEMMAP
+ def_bool HUGETLB_PAGE
+ depends on X86_64
+ depends on SPARSEMEM_VMEMMAP
+
+config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
+ bool "Default freeing vmemmap pages of HugeTLB to on"
+ default n
+ depends on HUGETLB_PAGE_FREE_VMEMMAP
+ help
+ When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing of unused vmemmap
+ pages associated with each HugeTLB page is off by default. Say Y here
+ to enable freeing vmemmap pages of HugeTLB by default. It can then
+ be disabled on the command line via hugetlb_free_vmemmap=off.
+
config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index fb7ee026d101..adbb3a1edcbf 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -73,6 +73,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations adfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = adfs_readpage,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index d91b0133d95d..75ebd2b576ca 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -453,6 +453,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations affs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
@@ -833,6 +834,7 @@ err_bh:
}
const struct address_space_operations affs_aops_ofs = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = affs_readpage_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a4e9e6e07e93..d3c6bb22c5f4 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call)
return ret;
call->unmarshall++;
+ fallthrough;
+
case 5:
break;
}
@@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
r->node[loop] = ntohl(b[loop + 5]);
call->unmarshall++;
+ fallthrough;
case 2:
break;
@@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
r->node[loop] = ntohl(b[loop + 5]);
call->unmarshall++;
+ fallthrough;
case 2:
break;
@@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
+ fallthrough;
case 3:
break;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 9fbe5a5ec9bd..78719f2f567e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1919,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op)
new_inode = d_inode(new_dentry);
if (new_inode) {
spin_lock(&new_inode->i_lock);
- if (new_inode->i_nlink > 0)
+ if (S_ISDIR(new_inode->i_mode))
+ clear_nlink(new_inode);
+ else if (new_inode->i_nlink > 0)
drop_nlink(new_inode);
spin_unlock(&new_inode->i_lock);
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 2f695a260442..dd3f45d906d2 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -388,6 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
req->file_size = vp->scb.status.size;
call->unmarshall++;
+ fallthrough;
case 5:
break;
@@ -1408,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("motd '%s'", p);
call->unmarshall++;
+ fallthrough;
case 8:
break;
@@ -1845,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
xdr_decode_AFSVolSync(&bp, &op->volsync);
call->unmarshall++;
+ fallthrough;
case 6:
break;
@@ -1979,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
xdr_decode_AFSVolSync(&bp, &op->volsync);
call->unmarshall++;
+ fallthrough;
case 4:
break;
diff --git a/fs/afs/main.c b/fs/afs/main.c
index b2975256dadb..179004b15566 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -203,8 +203,8 @@ static int __init afs_init(void)
goto error_fs;
afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs");
- if (IS_ERR(afs_proc_symlink)) {
- ret = PTR_ERR(afs_proc_symlink);
+ if (!afs_proc_symlink) {
+ ret = -ENOMEM;
goto error_proc;
}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index dc9327332f06..00fca3c66ba6 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (ret < 0)
return ret;
call->unmarshall = 6;
+ fallthrough;
case 6:
break;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3edb6204b937..3104b62c2082 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -118,6 +118,15 @@ int afs_write_end(struct file *file, struct address_space *mapping,
_enter("{%llx:%llu},{%lx}",
vnode->fid.vid, vnode->fid.vnode, page->index);
+ if (!PageUptodate(page)) {
+ if (copied < len) {
+ copied = 0;
+ goto out;
+ }
+
+ SetPageUptodate(page);
+ }
+
if (copied == 0)
goto out;
@@ -132,8 +141,6 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_sequnlock(&vnode->cb_lock);
}
- ASSERT(PageUptodate(page));
-
if (PagePrivate(page)) {
priv = page_private(page);
f = afs_page_dirty_from(page, priv);
@@ -730,7 +737,7 @@ static int afs_writepages_region(struct address_space *mapping,
return ret;
}
- start += ret * PAGE_SIZE;
+ start += ret;
cond_resched();
} while (wbc->nr_to_write > 0);
@@ -837,6 +844,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
unsigned long priv;
+ vm_fault_t ret = VM_FAULT_RETRY;
_enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index);
@@ -848,14 +856,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page) &&
wait_on_page_fscache_killable(page) < 0)
- return VM_FAULT_RETRY;
+ goto out;
#endif
if (wait_on_page_writeback_killable(page))
- return VM_FAULT_RETRY;
+ goto out;
if (lock_page_killable(page) < 0)
- return VM_FAULT_RETRY;
+ goto out;
/* We mustn't change page->private until writeback is complete as that
* details the portion of the page we need to write back and we might
@@ -863,7 +871,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
*/
if (wait_on_page_writeback_killable(page) < 0) {
unlock_page(page);
- return VM_FAULT_RETRY;
+ goto out;
}
priv = afs_page_dirty(page, 0, thp_size(page));
@@ -877,8 +885,10 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
}
file_update_time(file);
+ ret = VM_FAULT_LOCKED;
+out:
sb_end_pagefault(inode->i_sb);
- return VM_FAULT_LOCKED;
+ return ret;
}
/*
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 0dceefc54b48..7f8544abf636 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -188,6 +188,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations bfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = bfs_readpage,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 3e84e9bb9084..145917f734fe 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -222,7 +222,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
fd_offset);
if (error != N_TXTADDR(ex))
@@ -230,7 +230,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
fd_offset + ex.a_text);
if (error != N_DATADDR(ex))
return error;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 187b3f2b9202..439ed81e755a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1070,7 +1070,7 @@ out_free_interp:
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
- elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
+ elf_flags = MAP_PRIVATE | MAP_DENYWRITE;
vaddr = elf_ppnt->p_vaddr;
/*
@@ -1537,7 +1537,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
{
const struct cred *cred;
unsigned int i, len;
-
+ unsigned int state;
+
/* first copy the parameters from user space */
memset(psinfo, 0, sizeof(struct elf_prpsinfo));
@@ -1559,7 +1560,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_pgrp = task_pgrp_vnr(p);
psinfo->pr_sid = task_session_vnr(p);
- i = p->state ? ffz(~p->state) + 1 : 0;
+ state = READ_ONCE(p->__state);
+ i = state ? ffz(~state) + 1 : 0;
psinfo->pr_state = i;
psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
psinfo->pr_zomb = psinfo->pr_sname == 'Z';
@@ -1571,7 +1573,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
rcu_read_unlock();
strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
-
+
return 0;
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c99b102c860..cf4028487dcc 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -928,7 +928,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
{
struct elf32_fdpic_loadseg *seg;
struct elf32_phdr *phdr;
- unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags;
+ unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0;
int loop, ret;
load_addr = params->load_addr;
@@ -948,12 +948,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
}
/* allocate one big anon block for everything */
- mflags = MAP_PRIVATE;
- if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
- mflags |= MAP_EXECUTABLE;
-
maddr = vm_mmap(NULL, load_addr, top - base,
- PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
+ PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, 0);
if (IS_ERR_VALUE(maddr))
return (int) maddr;
@@ -1046,9 +1042,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
if (phdr->p_flags & PF_X) prot |= PROT_EXEC;
flags = MAP_PRIVATE | MAP_DENYWRITE;
- if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
- flags |= MAP_EXECUTABLE;
-
maddr = 0;
switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) {
@@ -1331,6 +1324,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
{
const struct cred *cred;
unsigned int i, len;
+ unsigned int state;
/* first copy the parameters from user space */
memset(psinfo, 0, sizeof(struct elf_prpsinfo));
@@ -1353,7 +1347,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
psinfo->pr_pgrp = task_pgrp_vnr(p);
psinfo->pr_sid = task_session_vnr(p);
- i = p->state ? ffz(~p->state) + 1 : 0;
+ state = READ_ONCE(p->__state);
+ i = state ? ffz(~state) + 1 : 0;
psinfo->pr_state = i;
psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i];
psinfo->pr_zomb = psinfo->pr_sname == 'Z';
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a1072c6a2341..5d776f80ee50 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -573,7 +573,7 @@ static int load_flat_file(struct linux_binprm *bprm,
pr_debug("ROM mapping of file (we hope)\n");
textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
- MAP_PRIVATE|MAP_EXECUTABLE, 0);
+ MAP_PRIVATE, 0);
if (!textpos || IS_ERR_VALUE(textpos)) {
ret = textpos;
if (!textpos)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b8abccd03e5d..7e83c3e71504 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -895,7 +895,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
bdev = I_BDEV(inode);
- mutex_init(&bdev->bd_mutex);
mutex_init(&bdev->bd_fsfreeze_mutex);
spin_lock_init(&bdev->bd_size_lock);
bdev->bd_disk = disk;
@@ -1154,7 +1153,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
struct bd_holder_disk *holder;
int ret = 0;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&bdev->bd_disk->open_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
@@ -1199,7 +1198,7 @@ out_del:
out_free:
kfree(holder);
out_unlock:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
@@ -1218,7 +1217,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
struct bd_holder_disk *holder;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&bdev->bd_disk->open_mutex);
holder = bd_find_holder_disk(bdev, disk);
@@ -1230,133 +1229,97 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
kfree(holder);
}
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
#endif
-static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+static void blkdev_flush_mapping(struct block_device *bdev)
+{
+ WARN_ON_ONCE(bdev->bd_holders);
+ sync_blockdev(bdev);
+ kill_bdev(bdev);
+ bdev_write_inode(bdev);
+}
-int bdev_disk_changed(struct block_device *bdev, bool invalidate)
+static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
int ret = 0;
- lockdep_assert_held(&bdev->bd_mutex);
-
-rescan:
- if (bdev->bd_part_count)
- return -EBUSY;
- sync_blockdev(bdev);
- invalidate_bdev(bdev);
- blk_drop_partitions(disk);
-
- clear_bit(GD_NEED_PART_SCAN, &disk->state);
-
- /*
- * Historically we only set the capacity to zero for devices that
- * support partitions (independ of actually having partitions created).
- * Doing that is rather inconsistent, but changing it broke legacy
- * udisks polling for legacy ide-cdrom devices. Use the crude check
- * below to get the sane behavior for most device while not breaking
- * userspace for this particular setup.
- */
- if (invalidate) {
- if (disk_part_scan_enabled(disk) ||
- !(disk->flags & GENHD_FL_REMOVABLE))
- set_capacity(disk, 0);
+ if (disk->fops->open) {
+ ret = disk->fops->open(bdev, mode);
+ if (ret) {
+ /* avoid ghost partitions on a removed medium */
+ if (ret == -ENOMEDIUM &&
+ test_bit(GD_NEED_PART_SCAN, &disk->state))
+ bdev_disk_changed(disk, true);
+ return ret;
+ }
}
- if (get_capacity(disk)) {
- ret = blk_add_partitions(disk, bdev);
- if (ret == -EAGAIN)
- goto rescan;
- } else if (invalidate) {
- /*
- * Tell userspace that the media / partition table may have
- * changed.
- */
- kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+ if (!bdev->bd_openers) {
+ set_init_blocksize(bdev);
+ if (bdev->bd_bdi == &noop_backing_dev_info)
+ bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
}
-
- return ret;
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state))
+ bdev_disk_changed(disk, false);
+ bdev->bd_openers++;
+ return 0;;
}
-/*
- * Only exported for loop and dasd for historic reasons. Don't use in new
- * code!
- */
-EXPORT_SYMBOL_GPL(bdev_disk_changed);
-/*
- * bd_mutex locking:
- *
- * mutex_lock(part->bd_mutex)
- * mutex_lock_nested(whole->bd_mutex, 1)
- */
-static int __blkdev_get(struct block_device *bdev, fmode_t mode)
+static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
{
- struct gendisk *disk = bdev->bd_disk;
- int ret = 0;
+ if (!--bdev->bd_openers)
+ blkdev_flush_mapping(bdev);
+ if (bdev->bd_disk->fops->release)
+ bdev->bd_disk->fops->release(bdev->bd_disk, mode);
+}
- if (!bdev->bd_openers) {
- if (!bdev_is_partition(bdev)) {
- ret = 0;
- if (disk->fops->open)
- ret = disk->fops->open(bdev, mode);
+static int blkdev_get_part(struct block_device *part, fmode_t mode)
+{
+ struct gendisk *disk = part->bd_disk;
+ struct block_device *whole;
+ int ret;
- if (!ret)
- set_init_blocksize(bdev);
+ if (part->bd_openers)
+ goto done;
- /*
- * If the device is invalidated, rescan partition
- * if open succeeded or failed with -ENOMEDIUM.
- * The latter is necessary to prevent ghost
- * partitions on a removed medium.
- */
- if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
- (!ret || ret == -ENOMEDIUM))
- bdev_disk_changed(bdev, ret == -ENOMEDIUM);
+ whole = bdgrab(disk->part0);
+ ret = blkdev_get_whole(whole, mode);
+ if (ret)
+ goto out_put_whole;
- if (ret)
- return ret;
- } else {
- struct block_device *whole = bdgrab(disk->part0);
-
- mutex_lock_nested(&whole->bd_mutex, 1);
- ret = __blkdev_get(whole, mode);
- if (ret) {
- mutex_unlock(&whole->bd_mutex);
- bdput(whole);
- return ret;
- }
- whole->bd_part_count++;
- mutex_unlock(&whole->bd_mutex);
-
- if (!(disk->flags & GENHD_FL_UP) ||
- !bdev_nr_sectors(bdev)) {
- __blkdev_put(whole, mode, 1);
- bdput(whole);
- return -ENXIO;
- }
- set_init_blocksize(bdev);
- }
+ ret = -ENXIO;
+ if (!bdev_nr_sectors(part))
+ goto out_blkdev_put;
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
- } else {
- if (!bdev_is_partition(bdev)) {
- if (bdev->bd_disk->fops->open)
- ret = bdev->bd_disk->fops->open(bdev, mode);
- /* the same as first opener case, read comment there */
- if (test_bit(GD_NEED_PART_SCAN, &disk->state) &&
- (!ret || ret == -ENOMEDIUM))
- bdev_disk_changed(bdev, ret == -ENOMEDIUM);
- if (ret)
- return ret;
- }
- }
- bdev->bd_openers++;
+ disk->open_partitions++;
+ set_init_blocksize(part);
+ if (part->bd_bdi == &noop_backing_dev_info)
+ part->bd_bdi = bdi_get(disk->queue->backing_dev_info);
+done:
+ part->bd_openers++;
return 0;
+
+out_blkdev_put:
+ blkdev_put_whole(whole, mode);
+out_put_whole:
+ bdput(whole);
+ return ret;
+}
+
+static void blkdev_put_part(struct block_device *part, fmode_t mode)
+{
+ struct block_device *whole = bdev_whole(part);
+
+ if (--part->bd_openers)
+ return;
+ blkdev_flush_mapping(part);
+ whole->bd_disk->open_partitions--;
+ blkdev_put_whole(whole, mode);
+ bdput(whole);
}
struct block_device *blkdev_get_no_open(dev_t dev)
@@ -1364,16 +1327,12 @@ struct block_device *blkdev_get_no_open(dev_t dev)
struct block_device *bdev;
struct gendisk *disk;
- down_read(&bdev_lookup_sem);
bdev = bdget(dev);
if (!bdev) {
- up_read(&bdev_lookup_sem);
blk_request_module(dev);
- down_read(&bdev_lookup_sem);
-
bdev = bdget(dev);
if (!bdev)
- goto unlock;
+ return NULL;
}
disk = bdev->bd_disk;
@@ -1383,14 +1342,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
goto put_disk;
if (!try_module_get(bdev->bd_disk->fops->owner))
goto put_disk;
- up_read(&bdev_lookup_sem);
return bdev;
put_disk:
put_disk(disk);
bdput:
bdput(bdev);
-unlock:
- up_read(&bdev_lookup_sem);
return NULL;
}
@@ -1449,8 +1405,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
disk_block_events(disk);
- mutex_lock(&bdev->bd_mutex);
- ret =__blkdev_get(bdev, mode);
+ mutex_lock(&disk->open_mutex);
+ ret = -ENXIO;
+ if (!(disk->flags & GENHD_FL_UP))
+ goto abort_claiming;
+ if (bdev_is_partition(bdev))
+ ret = blkdev_get_part(bdev, mode);
+ else
+ ret = blkdev_get_whole(bdev, mode);
if (ret)
goto abort_claiming;
if (mode & FMODE_EXCL) {
@@ -1469,7 +1431,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
unblock_events = false;
}
}
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&disk->open_mutex);
if (unblock_events)
disk_unblock_events(disk);
@@ -1478,7 +1440,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
abort_claiming:
if (mode & FMODE_EXCL)
bd_abort_claiming(bdev, holder);
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&disk->open_mutex);
disk_unblock_events(disk);
put_blkdev:
blkdev_put_no_open(bdev);
@@ -1553,10 +1515,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
return 0;
}
-static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
+void blkdev_put(struct block_device *bdev, fmode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
- struct block_device *victim = NULL;
/*
* Sync early if it looks like we're the last one. If someone else
@@ -1568,41 +1529,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
if (bdev->bd_openers == 1)
sync_blockdev(bdev);
- mutex_lock_nested(&bdev->bd_mutex, for_part);
- if (for_part)
- bdev->bd_part_count--;
-
- if (!--bdev->bd_openers) {
- WARN_ON_ONCE(bdev->bd_holders);
- sync_blockdev(bdev);
- kill_bdev(bdev);
- bdev_write_inode(bdev);
- if (bdev_is_partition(bdev))
- victim = bdev_whole(bdev);
- }
-
- if (!bdev_is_partition(bdev) && disk->fops->release)
- disk->fops->release(disk, mode);
- mutex_unlock(&bdev->bd_mutex);
- if (victim) {
- __blkdev_put(victim, mode, 1);
- bdput(victim);
- }
-}
-
-void blkdev_put(struct block_device *bdev, fmode_t mode)
-{
- struct gendisk *disk = bdev->bd_disk;
-
- mutex_lock(&bdev->bd_mutex);
-
+ mutex_lock(&disk->open_mutex);
if (mode & FMODE_EXCL) {
struct block_device *whole = bdev_whole(bdev);
bool bdev_free;
/*
* Release a claim on the device. The holder fields
- * are protected with bdev_lock. bd_mutex is to
+ * are protected with bdev_lock. open_mutex is to
* synchronize disk_holder unlinking.
*/
spin_lock(&bdev_lock);
@@ -1633,9 +1567,13 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
* from userland - e.g. eject(1).
*/
disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
- mutex_unlock(&bdev->bd_mutex);
- __blkdev_put(bdev, mode, 0);
+ if (bdev_is_partition(bdev))
+ blkdev_put_part(bdev, mode);
+ else
+ blkdev_put_whole(bdev, mode);
+ mutex_unlock(&disk->open_mutex);
+
blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);
@@ -1735,20 +1673,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);
-/*
- * Try to release a page associated with block device when the system
- * is under memory pressure.
- */
-static int blkdev_releasepage(struct page *page, gfp_t wait)
-{
- struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
-
- if (super && super->s_op->bdev_try_to_free_page)
- return super->s_op->bdev_try_to_free_page(super, page, wait);
-
- return try_to_free_buffers(page);
-}
-
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -1756,13 +1680,13 @@ static int blkdev_writepages(struct address_space *mapping,
}
static const struct address_space_operations def_blk_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = blkdev_readpage,
.readahead = blkdev_readahead,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.writepages = blkdev_writepages,
- .releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
.migratepage = buffer_migrate_page_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
@@ -1942,10 +1866,10 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
old_inode = inode;
bdev = I_BDEV(inode);
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&bdev->bd_disk->open_mutex);
if (bdev->bd_openers)
func(bdev, arg);
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
spin_lock(&blockdev_superblock->s_inode_list_lock);
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 68b95ad82126..520a0f6a7d9e 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -18,6 +18,8 @@ config BTRFS_FS
select RAID6_PQ
select XOR_BLOCKS
select SRCU
+ depends on !PPC_256K_PAGES # powerpc
+ depends on !PAGE_SIZE_256KB # hexagon
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 117d423fdb93..7a8a2fc19533 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2675,7 +2675,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
*
* @ref_key: The same as @ref_key in handle_direct_tree_backref()
* @tree_key: The first key of this tree block.
- * @path: A clean (released) path, to avoid allocating path everytime
+ * @path: A clean (released) path, to avoid allocating path every time
* the function get called.
*/
static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index aa57bdc8fc89..38b127b9edfc 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1399,7 +1399,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -1491,7 +1490,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
- int ret;
+ LIST_HEAD(again_list);
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
@@ -1502,6 +1501,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
mutex_lock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->reclaim_bgs)) {
+ int ret = 0;
+
bg = list_first_entry(&fs_info->reclaim_bgs,
struct btrfs_block_group,
bg_list);
@@ -1547,9 +1548,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
bg->start);
next:
- btrfs_put_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
+ if (ret == -EAGAIN && list_empty(&bg->bg_list))
+ list_add_tail(&bg->bg_list, &again_list);
+ else
+ btrfs_put_block_group(bg);
}
+ list_splice_tail(&again_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
@@ -2442,16 +2447,16 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (!--cache->ro) {
- num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super -
- cache->zone_unusable - cache->used;
- sinfo->bytes_readonly -= num_bytes;
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
cache->zone_unusable = cache->alloc_offset - cache->used;
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
+ num_bytes = cache->length - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
+ sinfo->bytes_readonly -= num_bytes;
list_del_init(&cache->ro_list);
}
spin_unlock(&cache->lock);
@@ -2505,7 +2510,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group,
struct extent_changeset *data_reserved = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
- u64 num_pages = 0;
+ u64 cache_size = 0;
int retries = 0;
int ret = 0;
@@ -2617,20 +2622,20 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->length, SZ_256M);
- if (!num_pages)
- num_pages = 1;
+ cache_size = div_u64(block_group->length, SZ_256M);
+ if (!cache_size)
+ cache_size = 1;
- num_pages *= 16;
- num_pages *= PAGE_SIZE;
+ cache_size *= 16;
+ cache_size *= fs_info->sectorsize;
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
- num_pages);
+ cache_size);
if (ret)
goto out_put;
- ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
- num_pages, num_pages,
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
+ cache_size, cache_size,
&alloc_hint);
/*
* Our cache requires contiguous chunks so that we don't modify a bunch
@@ -3062,8 +3067,6 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- __btrfs_mod_total_bytes_pinned(cache->space_info,
- num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2bea01d23a5b..9a023ae0f98b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -28,6 +28,7 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -148,7 +149,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
const u32 csum_size = fs_info->csum_size;
const u32 sectorsize = fs_info->sectorsize;
struct page *page;
- unsigned long i;
+ unsigned int i;
char *kaddr;
u8 csum[BTRFS_CSUM_SIZE];
struct compressed_bio *cb = bio->bi_private;
@@ -207,7 +208,7 @@ static void end_compressed_bio_read(struct bio *bio)
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
- unsigned long index;
+ unsigned int index;
unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
int ret = 0;
@@ -333,7 +334,7 @@ static void end_compressed_bio_write(struct bio *bio)
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
struct page *page;
- unsigned long index;
+ unsigned int index;
if (bio->bi_status)
cb->errors = 1;
@@ -348,11 +349,10 @@ static void end_compressed_bio_write(struct bio *bio)
* call back into the FS and do all the end_io operations
*/
inode = cb->inode;
- cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
- btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+ btrfs_record_physical_zoned(inode, cb->start, bio);
+ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
bio->bi_status == BLK_STS_OK);
- cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
@@ -385,10 +385,10 @@ out:
* the end io hooks.
*/
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
- unsigned long len, u64 disk_start,
- unsigned long compressed_len,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages,
+ unsigned int nr_pages,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css)
{
@@ -401,6 +401,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
u64 first_byte = disk_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
+ const bool use_append = btrfs_use_zone_append(inode, disk_start);
+ const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -418,10 +420,23 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->nr_pages = nr_pages;
bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
+ bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+ if (use_append) {
+ struct btrfs_device *device;
+
+ device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
+ if (IS_ERR(device)) {
+ kfree(cb);
+ bio_put(bio);
+ return BLK_STS_NOTSUPP;
+ }
+
+ bio_set_dev(bio, device->bdev);
+ }
+
if (blkcg_css) {
bio->bi_opf |= REQ_CGROUP_PUNT;
kthread_associate_blkcg(blkcg_css);
@@ -432,6 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
int submit = 0;
+ int len = 0;
page = compressed_pages[pg_index];
page->mapping = inode->vfs_inode.i_mapping;
@@ -439,9 +455,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
+ /*
+ * Page can only be added to bio if the current bio fits in
+ * stripe.
+ */
+ if (!submit) {
+ if (pg_index == 0 && use_append)
+ len = bio_add_zone_append_page(bio, page,
+ PAGE_SIZE, 0);
+ else
+ len = bio_add_page(bio, page, PAGE_SIZE, 0);
+ }
+
page->mapping = NULL;
- if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
- PAGE_SIZE) {
+ if (submit || len < PAGE_SIZE) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -465,16 +492,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
}
bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
+ bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
if (blkcg_css)
bio->bi_opf |= REQ_CGROUP_PUNT;
+ /*
+ * Use bio_add_page() to ensure the bio has at least one
+ * page.
+ */
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
btrfs_info(fs_info,
- "bytes left %lu compress len %lu nr %lu",
+ "bytes left %lu compress len %u nr %u",
bytes_left, cb->compressed_len, cb->nr_pages);
}
bytes_left -= PAGE_SIZE;
@@ -636,9 +667,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
- unsigned long compressed_len;
- unsigned long nr_pages;
- unsigned long pg_index;
+ unsigned int compressed_len;
+ unsigned int nr_pages;
+ unsigned int pg_index;
struct page *page;
struct bio *comp_bio;
u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
@@ -1161,9 +1192,6 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level)
*
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
- *
- * @max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
*/
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
@@ -1184,20 +1212,6 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
return ret;
}
-/*
- * pages_in is an array of pages with compressed data.
- *
- * disk_start is the starting logical offset of this array in the file
- *
- * orig_bio contains the pages from the file that we want to decompress into
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous. They all correspond to the range of bytes covered by
- * the compressed extent.
- */
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
struct list_head *workspace;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 8001b700ea3a..c359f20920d0 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -31,6 +31,9 @@ struct compressed_bio {
/* number of bios pending for this compressed extent */
refcount_t pending_bios;
+ /* Number of compressed pages in the array */
+ unsigned int nr_pages;
+
/* the pages with the compressed data on them */
struct page **compressed_pages;
@@ -40,20 +43,17 @@ struct compressed_bio {
/* starting offset in the inode for our pages */
u64 start;
- /* number of bytes in the inode we're working on */
- unsigned long len;
-
- /* number of bytes on disk */
- unsigned long compressed_len;
+ /* Number of bytes in the inode we're working on */
+ unsigned int len;
- /* the compression algorithm for this bio */
- int compress_type;
+ /* Number of bytes on disk */
+ unsigned int compressed_len;
- /* number of compressed pages in the array */
- unsigned long nr_pages;
+ /* The compression algorithm for this bio */
+ u8 compress_type;
/* IO errors */
- int errors;
+ u8 errors;
int mirror_num;
/* for reads, this is the bio we are copying the data into */
@@ -91,10 +91,10 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
struct bio *bio);
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
- unsigned long len, u64 disk_start,
- unsigned long compressed_len,
+ unsigned int len, u64 disk_start,
+ unsigned int compressed_len,
struct page **compressed_pages,
- unsigned long nr_pages,
+ unsigned int nr_pages,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a484fb72a01f..4bc3ca2cbd7d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -596,7 +596,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid, fs_info->generation);
if (!should_cow_block(trans, root, buf)) {
- trans->dirty = true;
*cow_ret = buf;
return 0;
}
@@ -1788,10 +1787,8 @@ again:
* then we don't want to set the path blocking,
* so we test it here
*/
- if (!should_cow_block(trans, root, b)) {
- trans->dirty = true;
+ if (!should_cow_block(trans, root, b))
goto cow_done;
- }
/*
* must have write locks on this node and the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9fb76829a281..e5e53e592d4f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -561,10 +561,16 @@ enum {
/*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
- * Set and cleared while holding fs_info::balance_mutex.
*/
BTRFS_FS_BALANCE_RUNNING,
+ /*
+ * Indicate that relocation of a chunk has started, it's set per chunk
+ * and is toggled between chunks.
+ * Set, tested and cleared while holding fs_info::send_reloc_lock.
+ */
+ BTRFS_FS_RELOC_RUNNING,
+
/* Indicate that the cleaner thread is awake and doing something. */
BTRFS_FS_CLEANER_RUNNING,
@@ -817,8 +823,6 @@ struct btrfs_fs_info {
struct kobject *space_info_kobj;
struct kobject *qgroups_kobj;
- u64 total_pinned;
-
/* used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
@@ -871,6 +875,9 @@ struct btrfs_fs_info {
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
+ /* Cancellation requests for chunk relocation */
+ atomic_t reloc_cancel_req;
+
u32 data_chunk_allocations;
u32 metadata_ratio;
@@ -986,14 +993,15 @@ struct btrfs_fs_info {
struct crypto_shash *csum_shash;
+ spinlock_t send_reloc_lock;
/*
* Number of send operations in progress.
- * Updated while holding fs_info::balance_mutex.
+ * Updated while holding fs_info::send_reloc_lock.
*/
int send_in_progress;
- /* Type of exclusive operation running */
- unsigned long exclusive_operation;
+ /* Type of exclusive operation running, protected by super_lock */
+ enum btrfs_exclusive_operation exclusive_operation;
/*
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
@@ -1375,38 +1383,39 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
*
* Note: don't forget to add new options to btrfs_show_options()
*/
-#define BTRFS_MOUNT_NODATASUM (1 << 0)
-#define BTRFS_MOUNT_NODATACOW (1 << 1)
-#define BTRFS_MOUNT_NOBARRIER (1 << 2)
-#define BTRFS_MOUNT_SSD (1 << 3)
-#define BTRFS_MOUNT_DEGRADED (1 << 4)
-#define BTRFS_MOUNT_COMPRESS (1 << 5)
-#define BTRFS_MOUNT_NOTREELOG (1 << 6)
-#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
-#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
-#define BTRFS_MOUNT_NOSSD (1 << 9)
-#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10)
-#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
-#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
-#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
-#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
-#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
-#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
-/* bit 17 is free */
-#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
-#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
-#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
-#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
-#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
-#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
-#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
-#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
-#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
-#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
-#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
-#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
-#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30)
-#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31)
+enum {
+ BTRFS_MOUNT_NODATASUM = (1UL << 0),
+ BTRFS_MOUNT_NODATACOW = (1UL << 1),
+ BTRFS_MOUNT_NOBARRIER = (1UL << 2),
+ BTRFS_MOUNT_SSD = (1UL << 3),
+ BTRFS_MOUNT_DEGRADED = (1UL << 4),
+ BTRFS_MOUNT_COMPRESS = (1UL << 5),
+ BTRFS_MOUNT_NOTREELOG = (1UL << 6),
+ BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
+ BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
+ BTRFS_MOUNT_NOSSD = (1UL << 9),
+ BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
+ BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
+ BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
+ BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
+ BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
+ BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
+ BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
+ BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
+ BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
+ BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
+ BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
+ BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
+ BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
+ BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
+ BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
+ BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
+ BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
+ BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
+ BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
+ BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
+ BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
+};
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -2216,11 +2225,13 @@ BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
static inline bool btrfs_root_readonly(const struct btrfs_root *root)
{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
static inline bool btrfs_root_dead(const struct btrfs_root *root)
{
+ /* Byte-swap the constant at compile time, root_item::flags is LE */
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
}
@@ -2746,9 +2757,9 @@ enum btrfs_reserve_flush_enum {
/*
* Flush space by above mentioned methods and by:
* - Running delayed iputs
- * - Commiting transaction
+ * - Committing transaction
*
- * Can be interruped by fatal signal.
+ * Can be interrupted by a fatal signal.
*/
BTRFS_RESERVE_FLUSH_DATA,
BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
@@ -2758,7 +2769,7 @@ enum btrfs_reserve_flush_enum {
* Pretty much the same as FLUSH_ALL, but can also steal space from
* global rsv.
*
- * Can be interruped by fatal signal.
+ * Can be interrupted by a fatal signal.
*/
BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
@@ -2774,7 +2785,6 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 8,
RUN_DELAYED_IPUTS = 9,
COMMIT_TRANS = 10,
- FORCE_COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@ -3100,8 +3110,8 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+ struct page *page, u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3125,7 +3135,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode, u64 new_size,
- u32 min_type);
+ u32 min_type, u64 *extents_found);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -3146,9 +3156,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
unsigned long bio_flags);
-bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
- unsigned int size);
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
@@ -3187,7 +3195,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
u64 start, u64 end, int *page_started, unsigned long *nr_written,
struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
@@ -3222,6 +3231,9 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
@@ -3786,4 +3798,14 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
return fs_info->zoned != 0;
}
+/*
+ * We use page status Private2 to indicate there is an ordered extent with
+ * unfinished IO.
+ *
+ * Rename the Private2 accessors to Ordered, to improve readability.
+ */
+#define PageOrdered(page) PagePrivate2(page)
+#define SetPageOrdered(page) SetPagePrivate2(page)
+#define ClearPageOrdered(page) ClearPagePrivate2(page)
+
#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 56642ca7af10..2059d1504149 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -89,7 +89,7 @@
* ->outstanding_extents += 1 (current value is 1)
*
* -> set_delalloc
- * ->outstanding_extents += 1 (currrent value is 2)
+ * ->outstanding_extents += 1 (current value is 2)
*
* -> btrfs_delalloc_release_extents()
* ->outstanding_extents -= 1 (current value is 1)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1a88f6214ebc..257c1e18abd4 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -681,7 +681,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
{
struct btrfs_delayed_item *curr, *next;
int free_space;
- int total_data_size = 0, total_size = 0;
+ int total_size = 0;
struct extent_buffer *leaf;
char *data_ptr;
struct btrfs_key *keys;
@@ -706,7 +706,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
*/
while (total_size + next->data_len + sizeof(struct btrfs_item) <=
free_space) {
- total_data_size += next->data_len;
total_size += next->data_len + sizeof(struct btrfs_item);
list_add_tail(&next->tree_list, &head);
nitems++;
@@ -974,14 +973,16 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
{
- struct btrfs_delayed_root *delayed_root;
- ASSERT(delayed_node->root);
- clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
- delayed_node->count--;
+ if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
+ struct btrfs_delayed_root *delayed_root;
- delayed_root = delayed_node->root->fs_info->delayed_root;
- finish_one_item(delayed_root);
+ ASSERT(delayed_node->root);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+ }
}
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1009,12 +1010,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
memalloc_nofs_restore(nofs_flag);
- if (ret > 0) {
- btrfs_release_path(path);
- return -ENOENT;
- } else if (ret < 0) {
- return ret;
- }
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1024,7 +1023,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
- goto no_iref;
+ goto out;
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf))
@@ -1046,12 +1045,19 @@ again:
btrfs_del_item(trans, root, path);
out:
btrfs_release_delayed_iref(node);
-no_iref:
btrfs_release_path(path);
err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the improper
+ * counts behind.
+ */
+ if (ret && ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
+
return ret;
search:
@@ -1898,8 +1904,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
btrfs_release_delayed_item(prev_item);
}
- if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
- btrfs_release_delayed_iref(delayed_node);
+ btrfs_release_delayed_iref(delayed_node);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index c92d9d4f5f46..06bc842ecdb3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -641,7 +641,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
- u64 flags = btrfs_ref_head_to_space_flags(existing);
int old_ref_mod;
BUG_ON(existing->is_data != update->is_data);
@@ -711,26 +710,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
}
}
- /*
- * This handles the following conditions:
- *
- * 1. We had a ref mod of 0 or more and went negative, indicating that
- * we may be freeing space, so add our space to the
- * total_bytes_pinned counter.
- * 2. We were negative and went to 0 or positive, so no longer can say
- * that the space would be pinned, decrement our counter from the
- * total_bytes_pinned counter.
- * 3. We are now at 0 and have ->must_insert_reserved set, which means
- * this was a new allocation and then we dropped it, and thus must
- * add our space to the total_bytes_pinned counter.
- */
- if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
- btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
- else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
- btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
- else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
- btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
-
spin_unlock(&existing->lock);
}
@@ -835,17 +814,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
- u64 flags = btrfs_ref_head_to_space_flags(head_ref);
-
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_updates +=
btrfs_csum_bytes_to_leaves(trans->fs_info,
head_ref->num_bytes);
}
- if (head_ref->ref_mod < 0)
- btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
- head_ref->num_bytes);
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d05f73530af7..d029be40ea6f 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -37,7 +37,7 @@
* - Write duplication
*
* All new writes will be written to both target and source devices, so even
- * if replace gets canceled, sources device still contans up-to-date data.
+ * if replace gets canceled, sources device still contains up-to-date data.
*
* Location: handle_ops_on_dev_replace() from __btrfs_map_block()
* Start: btrfs_dev_replace_start()
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 306ff20af70f..e1b7bd927d69 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -624,7 +624,7 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
* @fs_info: fs_info of interest
*
* The unused_bgs list needs to be punted to the discard lists because the
- * order of operations is changed. In the normal sychronous discard path, the
+ * order of operations is changed. In the normal synchronous discard path, the
* block groups are trimmed via a single large trim in transaction commit. This
* is ultimately what we are trying to avoid with asynchronous discard. Thus,
* it must be done before going down the unused_bgs path.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c9a3036c23bf..b117dd3b8172 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -241,7 +241,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
- bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -249,9 +248,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
- if (need_lock)
- btrfs_tree_read_lock(eb);
-
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
if (extent_buffer_uptodate(eb) &&
@@ -264,22 +260,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
-
- /*
- * Things reading via commit roots that don't have normal protection,
- * like send, can have a really old block in cache that may point at a
- * block that has been freed and re-allocated. So don't clear uptodate
- * if we find an eb that is under IO (dirty/writeback) because we could
- * end up reading in the stale data and then writing it back out and
- * making everybody very sad.
- */
- if (!extent_buffer_under_io(eb))
- clear_extent_buffer_uptodate(eb);
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state);
- if (need_lock)
- btrfs_tree_read_unlock(eb);
return ret;
}
@@ -584,6 +568,7 @@ static int validate_extent_buffer(struct extent_buffer *eb)
const u32 csum_size = fs_info->csum_size;
u8 found_level;
u8 result[BTRFS_CSUM_SIZE];
+ const u8 *header_csum;
int ret = 0;
found_start = btrfs_header_bytenr(eb);
@@ -608,15 +593,14 @@ static int validate_extent_buffer(struct extent_buffer *eb)
}
csum_tree_block(eb, result);
+ header_csum = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
- if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u8 val[BTRFS_CSUM_SIZE] = { 0 };
-
- read_extent_buffer(eb, &val, 0, csum_size);
+ if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
- fs_info->sb->s_id, eb->start,
- CSUM_FMT_VALUE(csum_size, val),
+ "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
+ eb->start,
+ CSUM_FMT_VALUE(csum_size, header_csum),
CSUM_FMT_VALUE(csum_size, result),
btrfs_header_level(eb));
ret = -EUCLEAN;
@@ -917,23 +901,22 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
return btree_csum_one_bio(bio);
}
-static int check_async_write(struct btrfs_fs_info *fs_info,
+static bool should_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
if (btrfs_is_zoned(fs_info))
- return 0;
+ return false;
if (atomic_read(&bi->sync_writers))
- return 0;
+ return false;
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
- return 0;
- return 1;
+ return false;
+ return true;
}
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- int async = check_async_write(fs_info, BTRFS_I(inode));
blk_status_t ret;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
@@ -946,7 +929,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
if (ret)
goto out_w_error;
ret = btrfs_map_bio(fs_info, bio, mirror_num);
- } else if (!async) {
+ } else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
@@ -2252,6 +2235,7 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
}
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
@@ -2648,6 +2632,24 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
+ BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+ "superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
+ fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
+ memcmp(fs_info->fs_devices->metadata_uuid,
+ fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
+ btrfs_err(fs_info,
+"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
+ fs_info->super_copy->metadata_uuid,
+ fs_info->fs_devices->metadata_uuid);
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
@@ -2981,6 +2983,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
+ spin_lock_init(&fs_info->send_reloc_lock);
fs_info->send_in_progress = 0;
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
@@ -3279,14 +3282,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
disk_super = fs_info->super_copy;
- ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
- BTRFS_FSID_SIZE));
-
- if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
- ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
- fs_info->super_copy->metadata_uuid,
- BTRFS_FSID_SIZE));
- }
features = btrfs_super_flags(disk_super);
if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
@@ -3461,7 +3456,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
* At this point we know all the devices that make this filesystem,
* including the seed devices but we don't know yet if the replace
* target is required. So free devices that are not part of this
- * filesystem but skip the replace traget device which is checked
+ * filesystem but skip the replace target device which is checked
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
@@ -3588,8 +3583,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
ret = btrfsic_mount(fs_info, fs_devices,
btrfs_test_opt(fs_info,
- CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
- 1 : 0,
+ CHECK_INTEGRITY_DATA) ? 1 : 0,
fs_info->check_integrity_print_mask);
if (ret)
btrfs_warn(fs_info,
@@ -4686,9 +4680,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
cache->space_info->bytes_reserved -= head->num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
btrfs_put_block_group(cache);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f1d15b68994a..d296483d148f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1425,7 +1425,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
* bytenr of the parent block. Since new extents are always
* created with indirect references, this will only be the case
* when relocating a shared extent. In that case, root_objectid
- * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
+ * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
* be 0
*
* @root_objectid: The id of the root where this modification has originated,
@@ -1804,19 +1804,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
}
- /*
- * We were dropping refs, or had a new ref and dropped it, and thus must
- * adjust down our total_bytes_pinned, the space may or may not have
- * been pinned and so is accounted for properly in the pinned space by
- * now.
- */
- if (head->total_ref_mod < 0 ||
- (head->total_ref_mod == 0 && head->must_insert_reserved)) {
- u64 flags = btrfs_ref_head_to_space_flags(head);
-
- btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
- }
-
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}
@@ -1868,7 +1855,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
- return 0;
+ return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
@@ -2551,7 +2538,6 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
return 0;
@@ -2762,7 +2748,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
space_info->max_extent_size = 0;
- __btrfs_mod_total_bytes_pinned(space_info, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
@@ -4784,7 +4769,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
buf->start + buf->len - 1, GFP_NOFS);
}
- trans->dirty = true;
/* this returns a buffer locked for blocking */
return buf;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 83b9c64ba76e..9e81d25dea70 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -136,7 +136,7 @@ struct tree_entry {
};
struct extent_page_data {
- struct bio *bio;
+ struct btrfs_bio_ctrl bio_ctrl;
/* tells writepage not to lock the state bits for this range
* it still does the unlocking
*/
@@ -185,10 +185,12 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num,
/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
- if (epd->bio) {
- epd->bio->bi_status = errno_to_blk_status(ret);
- bio_endio(epd->bio);
- epd->bio = NULL;
+ struct bio *bio = epd->bio_ctrl.bio;
+
+ if (bio) {
+ bio->bi_status = errno_to_blk_status(ret);
+ bio_endio(bio);
+ epd->bio_ctrl.bio = NULL;
}
}
@@ -201,9 +203,10 @@ static void end_write_bio(struct extent_page_data *epd, int ret)
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
int ret = 0;
+ struct bio *bio = epd->bio_ctrl.bio;
- if (epd->bio) {
- ret = submit_one_bio(epd->bio, 0, 0);
+ if (bio) {
+ ret = submit_one_bio(bio, 0, 0);
/*
* Clean up of epd->bio is handled by its endio function.
* And endio is either triggered by successful bio execution
@@ -211,7 +214,7 @@ static int __must_check flush_write_bio(struct extent_page_data *epd)
* So at this point, no matter what happened, we don't need
* to clean up epd->bio.
*/
- epd->bio = NULL;
+ epd->bio_ctrl.bio = NULL;
}
return ret;
}
@@ -1805,10 +1808,130 @@ out:
return found;
}
+/*
+ * Process one page for __process_pages_contig().
+ *
+ * Return >0 if we hit @page == @locked_page.
+ * Return 0 if we updated the page status.
+ * Return -EAGAIN if we need to try again.
+ * (PAGE_LOCK case: the page is not dirty or no longer belongs to @mapping)
+ */
+static int process_one_page(struct btrfs_fs_info *fs_info,
+ struct address_space *mapping,
+ struct page *page, struct page *locked_page,
+ unsigned long page_ops, u64 start, u64 end)
+{
+ u32 len;
+
+ ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
+ len = end + 1 - start;
+
+ if (page_ops & PAGE_SET_ORDERED)
+ btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ if (page_ops & PAGE_SET_ERROR)
+ btrfs_page_clamp_set_error(fs_info, page, start, len);
+ if (page_ops & PAGE_START_WRITEBACK) {
+ btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
+ btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ }
+ if (page_ops & PAGE_END_WRITEBACK)
+ btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+
+ if (page == locked_page)
+ return 1;
+
+ if (page_ops & PAGE_LOCK) {
+ int ret;
+
+ ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
+ if (ret)
+ return ret;
+ if (!PageDirty(page) || page->mapping != mapping) {
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return -EAGAIN;
+ }
+ }
+ if (page_ops & PAGE_UNLOCK)
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
+ return 0;
+}
+
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret);
+ u64 start, u64 end, unsigned long page_ops,
+ u64 *processed_end)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ unsigned long nr_pages = end_index - start_index + 1;
+ unsigned long pages_processed = 0;
+ struct page *pages[16];
+ int err = 0;
+ int i;
+
+ if (page_ops & PAGE_LOCK) {
+ ASSERT(page_ops == PAGE_LOCK);
+ ASSERT(processed_end && *processed_end == start);
+ }
+
+ if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ mapping_set_error(mapping, -EIO);
+
+ while (nr_pages > 0) {
+ int found_pages;
+
+ found_pages = find_get_pages_contig(mapping, index,
+ min_t(unsigned long,
+ nr_pages, ARRAY_SIZE(pages)), pages);
+ if (found_pages == 0) {
+ /*
+ * Only if we're going to lock these pages, we can find
+ * nothing at @index.
+ */
+ ASSERT(page_ops & PAGE_LOCK);
+ err = -EAGAIN;
+ goto out;
+ }
+
+ for (i = 0; i < found_pages; i++) {
+ int process_ret;
+
+ process_ret = process_one_page(fs_info, mapping,
+ pages[i], locked_page, page_ops,
+ start, end);
+ if (process_ret < 0) {
+ for (; i < found_pages; i++)
+ put_page(pages[i]);
+ err = -EAGAIN;
+ goto out;
+ }
+ put_page(pages[i]);
+ pages_processed++;
+ }
+ nr_pages -= found_pages;
+ index += found_pages;
+ cond_resched();
+ }
+out:
+ if (err && processed_end) {
+ /*
+ * Update @processed_end. I know this is awful since it has
+ * two different return value patterns (inclusive vs exclusive).
+ *
+ * But the exclusive pattern is necessary if @start is 0, or we
+ * underflow and check against processed_end won't work as
+ * expected.
+ */
+ if (pages_processed)
+ *processed_end = min(end,
+ ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
+ else
+ *processed_end = start;
+ }
+ return err;
+}
static noinline void __unlock_for_delalloc(struct inode *inode,
struct page *locked_page,
@@ -1821,7 +1944,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
if (index == locked_page->index && end_index == index)
return;
- __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
+ __process_pages_contig(inode->i_mapping, locked_page, start, end,
PAGE_UNLOCK, NULL);
}
@@ -1831,19 +1954,19 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 delalloc_end)
{
unsigned long index = delalloc_start >> PAGE_SHIFT;
- unsigned long index_ret = index;
unsigned long end_index = delalloc_end >> PAGE_SHIFT;
+ u64 processed_end = delalloc_start;
int ret;
ASSERT(locked_page);
if (index == locked_page->index && index == end_index)
return 0;
- ret = __process_pages_contig(inode->i_mapping, locked_page, index,
- end_index, PAGE_LOCK, &index_ret);
- if (ret == -EAGAIN)
+ ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
+ delalloc_end, PAGE_LOCK, &processed_end);
+ if (ret == -EAGAIN && processed_end > delalloc_start)
__unlock_for_delalloc(inode, locked_page, delalloc_start,
- (u64)index_ret << PAGE_SHIFT);
+ processed_end);
return ret;
}
@@ -1936,84 +2059,6 @@ out_failed:
return found;
}
-static int __process_pages_contig(struct address_space *mapping,
- struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret)
-{
- unsigned long nr_pages = end_index - start_index + 1;
- unsigned long pages_processed = 0;
- pgoff_t index = start_index;
- struct page *pages[16];
- unsigned ret;
- int err = 0;
- int i;
-
- if (page_ops & PAGE_LOCK) {
- ASSERT(page_ops == PAGE_LOCK);
- ASSERT(index_ret && *index_ret == start_index);
- }
-
- if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
- mapping_set_error(mapping, -EIO);
-
- while (nr_pages > 0) {
- ret = find_get_pages_contig(mapping, index,
- min_t(unsigned long,
- nr_pages, ARRAY_SIZE(pages)), pages);
- if (ret == 0) {
- /*
- * Only if we're going to lock these pages,
- * can we find nothing at @index.
- */
- ASSERT(page_ops & PAGE_LOCK);
- err = -EAGAIN;
- goto out;
- }
-
- for (i = 0; i < ret; i++) {
- if (page_ops & PAGE_SET_PRIVATE2)
- SetPagePrivate2(pages[i]);
-
- if (locked_page && pages[i] == locked_page) {
- put_page(pages[i]);
- pages_processed++;
- continue;
- }
- if (page_ops & PAGE_START_WRITEBACK) {
- clear_page_dirty_for_io(pages[i]);
- set_page_writeback(pages[i]);
- }
- if (page_ops & PAGE_SET_ERROR)
- SetPageError(pages[i]);
- if (page_ops & PAGE_END_WRITEBACK)
- end_page_writeback(pages[i]);
- if (page_ops & PAGE_UNLOCK)
- unlock_page(pages[i]);
- if (page_ops & PAGE_LOCK) {
- lock_page(pages[i]);
- if (!PageDirty(pages[i]) ||
- pages[i]->mapping != mapping) {
- unlock_page(pages[i]);
- for (; i < ret; i++)
- put_page(pages[i]);
- err = -EAGAIN;
- goto out;
- }
- }
- put_page(pages[i]);
- pages_processed++;
- }
- nr_pages -= ret;
- index += ret;
- cond_resched();
- }
-out:
- if (err && index_ret)
- *index_ret = start_index + pages_processed - 1;
- return err;
-}
-
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 clear_bits, unsigned long page_ops)
@@ -2021,8 +2066,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT,
- page_ops, NULL);
+ start, end, page_ops, NULL);
}
/*
@@ -2381,13 +2425,6 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
BUG_ON(!failrec->this_mirror);
- if (failrec->in_validation) {
- /* there was no real error, just free the record */
- btrfs_debug(fs_info,
- "clean_io_failure: freeing dummy error at %llu",
- failrec->start);
- goto out;
- }
if (sb_rdonly(fs_info->sb))
goto out;
@@ -2449,7 +2486,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
}
static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
- u64 start, u64 end)
+ u64 start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct io_failure_record *failrec;
@@ -2457,15 +2494,15 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
u64 logical;
failrec = get_state_failrec(failure_tree, start);
if (!IS_ERR(failrec)) {
btrfs_debug(fs_info,
- "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
- failrec->logical, failrec->start, failrec->len,
- failrec->in_validation);
+ "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
+ failrec->logical, failrec->start, failrec->len);
/*
* when data can be on disk more than twice, add to failrec here
* (e.g. with a list for failed_mirror) to make
@@ -2480,10 +2517,9 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return ERR_PTR(-ENOMEM);
failrec->start = start;
- failrec->len = end - start + 1;
+ failrec->len = sectorsize;
failrec->this_mirror = 0;
failrec->bio_flags = 0;
- failrec->in_validation = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -2519,12 +2555,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
free_extent_map(em);
/* Set the bits in the private failure tree */
- ret = set_extent_bits(failure_tree, start, end,
+ ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
EXTENT_LOCKED | EXTENT_DIRTY);
if (ret >= 0) {
ret = set_state_failrec(failure_tree, start, failrec);
/* Set the bits in the inode's tree */
- ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
+ ret = set_extent_bits(tree, start, start + sectorsize - 1,
+ EXTENT_DAMAGED);
} else if (ret < 0) {
kfree(failrec);
return ERR_PTR(ret);
@@ -2533,7 +2570,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
return failrec;
}
-static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
+static bool btrfs_check_repairable(struct inode *inode,
struct io_failure_record *failrec,
int failed_mirror)
{
@@ -2553,39 +2590,22 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
return false;
}
+ /* The failure record should only contain one sector */
+ ASSERT(failrec->len == fs_info->sectorsize);
+
/*
- * there are two premises:
- * a) deliver good data to the caller
- * b) correct the bad sectors on disk
+ * There are two premises:
+ * a) deliver good data to the caller
+ * b) correct the bad sectors on disk
+ *
+ * Since we're only doing repair for one sector, we only need to get
+ * a good copy of the failed sector and if we succeed, we have setup
+ * everything for repair_io_failure to do the rest for us.
*/
- if (needs_validation) {
- /*
- * to fulfill b), we need to know the exact failing sectors, as
- * we don't want to rewrite any more than the failed ones. thus,
- * we need separate read requests for the failed bio
- *
- * if the following BUG_ON triggers, our validation request got
- * merged. we need separate requests for our algorithm to work.
- */
- BUG_ON(failrec->in_validation);
- failrec->in_validation = 1;
- failrec->this_mirror = failed_mirror;
- } else {
- /*
- * we're ready to fulfill a) and b) alongside. get a good copy
- * of the failed sector and if we succeed, we have setup
- * everything for repair_io_failure to do the rest for us.
- */
- if (failrec->in_validation) {
- BUG_ON(failrec->this_mirror != failed_mirror);
- failrec->in_validation = 0;
- failrec->this_mirror = 0;
- }
- failrec->failed_mirror = failed_mirror;
+ failrec->failed_mirror = failed_mirror;
+ failrec->this_mirror++;
+ if (failrec->this_mirror == failed_mirror)
failrec->this_mirror++;
- if (failrec->this_mirror == failed_mirror)
- failrec->this_mirror++;
- }
if (failrec->this_mirror > num_copies) {
btrfs_debug(fs_info,
@@ -2597,53 +2617,11 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
return true;
}
-static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
-{
- u64 len = 0;
- const u32 blocksize = inode->i_sb->s_blocksize;
-
- /*
- * If bi_status is BLK_STS_OK, then this was a checksum error, not an
- * I/O error. In this case, we already know exactly which sector was
- * bad, so we don't need to validate.
- */
- if (bio->bi_status == BLK_STS_OK)
- return false;
-
- /*
- * We need to validate each sector individually if the failed I/O was
- * for multiple sectors.
- *
- * There are a few possible bios that can end up here:
- * 1. A buffered read bio, which is not cloned.
- * 2. A direct I/O read bio, which is cloned.
- * 3. A (buffered or direct) repair bio, which is not cloned.
- *
- * For cloned bios (case 2), we can get the size from
- * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
- * it from the bvecs.
- */
- if (bio_flagged(bio, BIO_CLONED)) {
- if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
- return true;
- } else {
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_bvec_all(bvec, bio, i) {
- len += bvec->bv_len;
- if (len > blocksize)
- return true;
- }
- }
- return false;
-}
-
-blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- submit_bio_hook_t *submit_bio_hook)
+int btrfs_repair_one_sector(struct inode *inode,
+ struct bio *failed_bio, u32 bio_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, int failed_mirror,
+ submit_bio_hook_t *submit_bio_hook)
{
struct io_failure_record *failrec;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2651,7 +2629,6 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
- bool need_validation;
struct bio *repair_bio;
struct btrfs_io_bio *repair_io_bio;
blk_status_t status;
@@ -2661,23 +2638,19 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
- failrec = btrfs_get_io_failure_record(inode, start, end);
+ failrec = btrfs_get_io_failure_record(inode, start);
if (IS_ERR(failrec))
- return errno_to_blk_status(PTR_ERR(failrec));
+ return PTR_ERR(failrec);
- need_validation = btrfs_io_needs_validation(inode, failed_bio);
- if (!btrfs_check_repairable(inode, need_validation, failrec,
- failed_mirror)) {
+ if (!btrfs_check_repairable(inode, failrec, failed_mirror)) {
free_io_failure(failure_tree, tree, failrec);
- return BLK_STS_IOERR;
+ return -EIO;
}
repair_bio = btrfs_io_bio_alloc(1);
repair_io_bio = btrfs_io_bio(repair_bio);
repair_bio->bi_opf = REQ_OP_READ;
- if (need_validation)
- repair_bio->bi_opf |= REQ_FAILFAST_DEV;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
@@ -2695,8 +2668,8 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
repair_io_bio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
-"repair read error: submitting new read to mirror %d, in_validation=%d",
- failrec->this_mirror, failrec->in_validation);
+ "repair read error: submitting new read to mirror %d",
+ failrec->this_mirror);
status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
failrec->bio_flags);
@@ -2704,17 +2677,114 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode,
free_io_failure(failure_tree, tree, failrec);
bio_put(repair_bio);
}
- return status;
+ return blk_status_to_errno(status);
+}
+
+static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+
+ ASSERT(page_offset(page) <= start &&
+ start + len <= page_offset(page) + PAGE_SIZE);
+
+ if (uptodate) {
+ btrfs_page_set_uptodate(fs_info, page, start, len);
+ } else {
+ btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_page_set_error(fs_info, page, start, len);
+ }
+
+ if (fs_info->sectorsize == PAGE_SIZE)
+ unlock_page(page);
+ else
+ btrfs_subpage_end_reader(fs_info, page, start, len);
+}
+
+static blk_status_t submit_read_repair(struct inode *inode,
+ struct bio *failed_bio, u32 bio_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, u64 end, int failed_mirror,
+ unsigned int error_bitmap,
+ submit_bio_hook_t *submit_bio_hook)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
+ const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
+ int error = 0;
+ int i;
+
+ BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
+
+ /* We're here because we had some read errors or csum mismatch */
+ ASSERT(error_bitmap);
+
+ /*
+ * We only get called on buffered IO, thus page must be mapped and bio
+ * must not be cloned.
+ */
+ ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));
+
+ /* Iterate through all the sectors in the range */
+ for (i = 0; i < nr_bits; i++) {
+ const unsigned int offset = i * sectorsize;
+ struct extent_state *cached = NULL;
+ bool uptodate = false;
+ int ret;
+
+ if (!(error_bitmap & (1U << i))) {
+ /*
+ * This sector has no error, just end the page read
+ * and unlock the range.
+ */
+ uptodate = true;
+ goto next;
+ }
+
+ ret = btrfs_repair_one_sector(inode, failed_bio,
+ bio_offset + offset,
+ page, pgoff + offset, start + offset,
+ failed_mirror, submit_bio_hook);
+ if (!ret) {
+ /*
+ * We have submitted the read repair, the page release
+ * will be handled by the endio function of the
+ * submitted repair bio.
+ * Thus we don't need to do anything here.
+ */
+ continue;
+ }
+ /*
+ * Repair failed, just record the error but still continue.
+ * Otherwise, the remaining sectors will not be properly unlocked.
+ */
+ if (!error)
+ error = ret;
+next:
+ end_page_read(page, uptodate, start + offset, sectorsize);
+ if (uptodate)
+ set_extent_uptodate(&BTRFS_I(inode)->io_tree,
+ start + offset,
+ start + offset + sectorsize - 1,
+ &cached, GFP_ATOMIC);
+ unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
+ start + offset,
+ start + offset + sectorsize - 1,
+ &cached);
+ }
+ return errno_to_blk_status(error);
}
/* lots and lots of room for performance fixes in the end_bio funcs */
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
+ struct btrfs_inode *inode;
int uptodate = (err == 0);
int ret = 0;
- btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
+ ASSERT(page && page->mapping);
+ inode = BTRFS_I(page->mapping->host);
+ btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
if (!uptodate) {
ClearPageUptodate(page);
@@ -2747,25 +2817,20 @@ static void end_bio_extent_writepage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
- /* We always issue full-page reads, but if some block
- * in a page fails to read, blk_update_request() will
- * advance bv_offset and adjust bv_len to compensate.
- * Print a warning for nonzero offsets, and an error
- * if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
- btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else
- btrfs_info(fs_info,
- "incomplete page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- }
+ /* Our read/write should always be sector aligned. */
+ if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ btrfs_err(fs_info,
+ "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ btrfs_info(fs_info,
+ "incomplete page write with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
- start = page_offset(page);
- end = start + bvec->bv_offset + bvec->bv_len - 1;
+ start = page_offset(page) + bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
if (first_bvec) {
btrfs_record_physical_zoned(inode, start, bio);
@@ -2773,7 +2838,8 @@ static void end_bio_extent_writepage(struct bio *bio)
}
end_extent_writepage(page, error, start, end);
- end_page_writeback(page);
+
+ btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
}
bio_put(bio);
@@ -2862,30 +2928,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
}
-static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
-
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
-
- if (uptodate) {
- btrfs_page_set_uptodate(fs_info, page, start, len);
- } else {
- btrfs_page_clear_uptodate(fs_info, page, start, len);
- btrfs_page_set_error(fs_info, page, start, len);
- }
-
- if (fs_info->sectorsize == PAGE_SIZE)
- unlock_page(page);
- else if (is_data_inode(page->mapping->host))
- /*
- * For subpage data, unlock the page if we're the last reader.
- * For subpage metadata, page lock is not utilized for read.
- */
- btrfs_subpage_end_reader(fs_info, page, start, len);
-}
-
/*
* Find extent buffer for a givne bytenr.
*
@@ -2929,7 +2971,6 @@ static struct extent_buffer *find_extent_buffer_readpage(
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- int uptodate = !bio->bi_status;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
@@ -2944,10 +2985,12 @@ static void end_bio_extent_readpage(struct bio *bio)
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
+ bool uptodate = !bio->bi_status;
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
+ unsigned int error_bitmap = (unsigned int)-1;
u64 start;
u64 end;
u32 len;
@@ -2982,14 +3025,16 @@ static void end_bio_extent_readpage(struct bio *bio)
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- if (is_data_inode(inode))
- ret = btrfs_verify_data_csum(io_bio,
+ if (is_data_inode(inode)) {
+ error_bitmap = btrfs_verify_data_csum(io_bio,
bio_offset, page, start, end);
- else
+ ret = error_bitmap;
+ } else {
ret = btrfs_validate_metadata_buffer(io_bio,
page, start, end, mirror);
+ }
if (ret)
- uptodate = 0;
+ uptodate = false;
else
clean_io_failure(BTRFS_I(inode)->root->fs_info,
failure_tree, tree, start,
@@ -3001,27 +3046,18 @@ static void end_bio_extent_readpage(struct bio *bio)
goto readpage_ok;
if (is_data_inode(inode)) {
-
/*
- * The generic bio_readpage_error handles errors the
- * following way: If possible, new read requests are
- * created and submitted and will end up in
- * end_bio_extent_readpage as well (if we're lucky,
- * not in the !uptodate case). In that case it returns
- * 0 and we just go on with the next page in our bio.
- * If it can't handle the error it will return -EIO and
- * we remain responsible for that page.
+ * btrfs_submit_read_repair() will handle all the good
+ * and bad sectors, we just continue to the next bvec.
*/
- if (!btrfs_submit_read_repair(inode, bio, bio_offset,
- page,
- start - page_offset(page),
- start, end, mirror,
- btrfs_submit_data_bio)) {
- uptodate = !bio->bi_status;
- ASSERT(bio_offset + len > bio_offset);
- bio_offset += len;
- continue;
- }
+ submit_read_repair(inode, bio, bio_offset, page,
+ start - page_offset(page), start,
+ end, mirror, error_bitmap,
+ btrfs_submit_data_bio);
+
+ ASSERT(bio_offset + len > bio_offset);
+ bio_offset += len;
+ continue;
} else {
struct extent_buffer *eb;
@@ -3151,42 +3187,99 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
*
* Return true if the page was successfully added. Otherwise, return false.
*/
-static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
+static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
+ struct page *page,
u64 disk_bytenr, unsigned int size,
unsigned int pg_offset,
- unsigned long prev_bio_flags,
unsigned long bio_flags)
{
+ struct bio *bio = bio_ctrl->bio;
+ u32 bio_size = bio->bi_iter.bi_size;
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
bool contig;
int ret;
- if (prev_bio_flags != bio_flags)
+ ASSERT(bio);
+ /* The limit should be calculated when bio_ctrl->bio is allocated */
+ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
+ if (bio_ctrl->bio_flags != bio_flags)
return false;
- if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
+ if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
if (!contig)
return false;
- if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags))
+ if (bio_size + size > bio_ctrl->len_to_oe_boundary ||
+ bio_size + size > bio_ctrl->len_to_stripe_boundary)
return false;
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct page *first_page = bio_first_bvec_all(bio)->bv_page;
-
- if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size))
- return false;
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
ret = bio_add_zone_append_page(bio, page, size, pg_offset);
- } else {
+ else
ret = bio_add_page(bio, page, size, pg_offset);
- }
return ret == size;
}
+static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
+ struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_io_geometry geom;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_map *em;
+ u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
+ int ret;
+
+ /*
+ * Pages for compressed extents are never submitted to disk directly,
+ * thus they have no real boundary; just set both limits to U32_MAX.
+ *
+ * The split happens for real compressed bio, which happens in
+ * btrfs_submit_compressed_read/write().
+ */
+ if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ return 0;
+ }
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
+ logical, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ return ret;
+ }
+ if (geom.len > U32_MAX)
+ bio_ctrl->len_to_stripe_boundary = U32_MAX;
+ else
+ bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
+
+ if (!btrfs_is_zoned(fs_info) ||
+ bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ ASSERT(fs_info->max_zone_append_size > 0);
+ /* Ordered extent not yet created, so we're good */
+ ordered = btrfs_lookup_ordered_extent(inode, logical);
+ if (!ordered) {
+ bio_ctrl->len_to_oe_boundary = U32_MAX;
+ return 0;
+ }
+
+ bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
+ ordered->disk_bytenr + ordered->disk_num_bytes - logical);
+ btrfs_put_ordered_extent(ordered);
+ return 0;
+}
+
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
* @wbc: optional writeback control for io accounting
@@ -3203,12 +3296,11 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page,
*/
static int submit_extent_page(unsigned int opf,
struct writeback_control *wbc,
+ struct btrfs_bio_ctrl *bio_ctrl,
struct page *page, u64 disk_bytenr,
size_t size, unsigned long pg_offset,
- struct bio **bio_ret,
bio_end_io_t end_io_func,
int mirror_num,
- unsigned long prev_bio_flags,
unsigned long bio_flags,
bool force_bio_submit)
{
@@ -3219,19 +3311,19 @@ static int submit_extent_page(unsigned int opf,
struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- ASSERT(bio_ret);
+ ASSERT(bio_ctrl);
- if (*bio_ret) {
- bio = *bio_ret;
+ ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
+ pg_offset + size <= PAGE_SIZE);
+ if (bio_ctrl->bio) {
+ bio = bio_ctrl->bio;
if (force_bio_submit ||
- !btrfs_bio_add_page(bio, page, disk_bytenr, io_size,
- pg_offset, prev_bio_flags, bio_flags)) {
- ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
- if (ret < 0) {
- *bio_ret = NULL;
+ !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
+ pg_offset, bio_flags)) {
+ ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
+ bio_ctrl->bio = NULL;
+ if (ret < 0)
return ret;
- }
- bio = NULL;
} else {
if (wbc)
wbc_account_cgroup_owner(wbc, page, io_size);
@@ -3254,22 +3346,18 @@ static int submit_extent_page(unsigned int opf,
wbc_account_cgroup_owner(wbc, page, io_size);
}
if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_device *device;
- em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size);
+ if (IS_ERR(device))
+ return PTR_ERR(device);
- map = em->map_lookup;
- /* We only support single profile for now */
- ASSERT(map->num_stripes == 1);
- btrfs_io_bio(bio)->device = map->stripes[0].dev;
-
- free_extent_map(em);
+ btrfs_io_bio(bio)->device = device;
}
- *bio_ret = bio;
+ bio_ctrl->bio = bio;
+ bio_ctrl->bio_flags = bio_flags;
+ ret = calc_bio_boundaries(bio_ctrl, inode);
return ret;
}
@@ -3382,7 +3470,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* return 0 on success, otherwise return error
*/
int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct bio **bio, unsigned long *bio_flags,
+ struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
@@ -3558,15 +3646,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
- page, disk_bytenr, iosize,
- pg_offset, bio,
+ bio_ctrl, page, disk_bytenr, iosize,
+ pg_offset,
end_bio_extent_readpage, 0,
- *bio_flags,
this_bio_flag,
force_bio_submit);
if (!ret) {
nr++;
- *bio_flags = this_bio_flag;
} else {
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, false, cur, iosize);
@@ -3580,11 +3666,10 @@ out:
}
static inline void contiguous_readpages(struct page *pages[], int nr_pages,
- u64 start, u64 end,
- struct extent_map **em_cached,
- struct bio **bio,
- unsigned long *bio_flags,
- u64 *prev_em_start)
+ u64 start, u64 end,
+ struct extent_map **em_cached,
+ struct btrfs_bio_ctrl *bio_ctrl,
+ u64 *prev_em_start)
{
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
@@ -3592,7 +3677,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
@@ -3680,6 +3765,54 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
}
/*
+ * Find the first byte we need to write.
+ *
+ * For subpage, one page can contain several sectors, and
+ * __extent_writepage_io() will just grab all extent maps in the page
+ * range and try to submit all non-inline/non-compressed extents.
+ *
+ * This is a big problem for subpage, we shouldn't re-submit already written
+ * data at all.
+ * This function will lookup subpage dirty bit to find which range we really
+ * need to submit.
+ *
+ * Return the next dirty range in [@start, @end).
+ * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
+ */
+static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
+ struct page *page, u64 *start, u64 *end)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u64 orig_start = *start;
+ /* Declare as unsigned long so we can use bitmap ops */
+ unsigned long dirty_bitmap;
+ unsigned long flags;
+ int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
+ int range_start_bit = nbits;
+ int range_end_bit;
+
+ /*
+ * For regular sector size == page size case, since one page only
+ * contains one sector, we return the page offset directly.
+ */
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ *start = page_offset(page);
+ *end = page_offset(page) + PAGE_SIZE;
+ return;
+ }
+
+ /* We should have the page locked, but just in case */
+ spin_lock_irqsave(&subpage->lock, flags);
+ dirty_bitmap = subpage->dirty_bitmap;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+
+ bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
+ BTRFS_SUBPAGE_BITMAP_SIZE);
+ *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
+ *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
+}
+
+/*
* helper for __extent_writepage. This calls the writepage start hooks,
* and does the loop to map the page into extents and bios.
*
@@ -3696,7 +3829,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
int *nr_ret)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_io_tree *tree = &inode->io_tree;
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
@@ -3727,15 +3859,26 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
while (cur <= end) {
u64 disk_bytenr;
u64 em_end;
+ u64 dirty_range_start = cur;
+ u64 dirty_range_end;
u32 iosize;
if (cur >= i_size) {
- btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
+ btrfs_writepage_endio_finish_ordered(inode, page, cur,
+ end, 1);
break;
}
+
+ find_next_dirty_byte(fs_info, page, &dirty_range_start,
+ &dirty_range_end);
+ if (cur < dirty_range_start) {
+ cur = dirty_range_start;
+ continue;
+ }
+
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
if (IS_ERR_OR_NULL(em)) {
- SetPageError(page);
+ btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
break;
}
@@ -3750,10 +3893,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
disk_bytenr = em->block_start + extent_offset;
- /* Note that em_end from extent_map_end() is exclusive */
- iosize = min(em_end, end + 1) - cur;
+ /*
+ * Note that em_end from extent_map_end() and dirty_range_end from
+ * find_next_dirty_byte() are all exclusive
+ */
+ iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
- if (btrfs_use_zone_append(inode, em))
+ if (btrfs_use_zone_append(inode, em->block_start))
opf = REQ_OP_ZONE_APPEND;
free_extent_map(em);
@@ -3768,28 +3914,38 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
if (compressed)
nr++;
else
- btrfs_writepage_endio_finish_ordered(page, cur,
- cur + iosize - 1, 1);
+ btrfs_writepage_endio_finish_ordered(inode,
+ page, cur, cur + iosize - 1, 1);
cur += iosize;
continue;
}
- btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
+ btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
btrfs_err(inode->root->fs_info,
"page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
- ret = submit_extent_page(opf | write_flags, wbc, page,
+ /*
+ * Although the PageDirty bit is cleared before entering this
+ * function, subpage dirty bit is not cleared.
+ * So clear subpage dirty bit here so next time we won't submit
+ * page for range already written to disk.
+ */
+ btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+
+ ret = submit_extent_page(opf | write_flags, wbc,
+ &epd->bio_ctrl, page,
disk_bytenr, iosize,
- cur - page_offset(page), &epd->bio,
+ cur - page_offset(page),
end_bio_extent_writepage,
- 0, 0, 0, false);
+ 0, 0, false);
if (ret) {
- SetPageError(page);
+ btrfs_page_set_error(fs_info, page, cur, iosize);
if (PageWriteback(page))
- end_page_writeback(page);
+ btrfs_page_clear_writeback(fs_info, page, cur,
+ iosize);
}
cur += iosize;
@@ -4098,12 +4254,15 @@ static struct extent_buffer *find_extent_buffer_nolock(
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
* after all extent buffers in the page have finished their writeback.
*/
-static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
- struct bio *bio)
+static void end_bio_subpage_eb_writepage(struct bio *bio)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
+ fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@@ -4154,16 +4313,11 @@ static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
static void end_bio_extent_buffer_writepage(struct bio *bio)
{
- struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
struct bvec_iter_all iter_all;
- fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
- if (fs_info->sectorsize < PAGE_SIZE)
- return end_bio_subpage_eb_writepage(fs_info, bio);
-
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@@ -4189,12 +4343,34 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
bio_put(bio);
}
+static void prepare_eb_write(struct extent_buffer *eb)
+{
+ u32 nritems;
+ unsigned long start;
+ unsigned long end;
+
+ clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
+ atomic_set(&eb->io_pages, num_extent_pages(eb));
+
+ /* Set btree blocks beyond nritems with 0 to avoid stale content */
+ nritems = btrfs_header_nritems(eb);
+ if (btrfs_header_level(eb) > 0) {
+ end = btrfs_node_key_ptr_offset(nritems);
+ memzero_extent_buffer(eb, end, eb->len - end);
+ } else {
+ /*
+ * Leaf:
+ * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
+ */
+ start = btrfs_item_nr_offset(nritems);
+ end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
+ memzero_extent_buffer(eb, start, end - start);
+ }
+}
+
/*
* Unlike the work in write_one_eb(), we rely completely on extent locking.
* Page locking is only utilized at minimum to keep the VMM code happy.
- *
- * Caller should still call write_one_eb() other than this function directly.
- * As write_one_eb() has extra preparation before submitting the extent buffer.
*/
static int write_one_subpage_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
@@ -4206,6 +4382,8 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
bool no_dirty_ebs = false;
int ret;
+ prepare_eb_write(eb);
+
/* clear_page_dirty_for_io() in subpage helper needs page locked */
lock_page(page);
btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
@@ -4216,10 +4394,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
- eb->start, eb->len, eb->start - page_offset(page),
- &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
- false);
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
+ &epd->bio_ctrl, page, eb->start, eb->len,
+ eb->start - page_offset(page),
+ end_bio_subpage_eb_writepage, 0, 0, false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
@@ -4244,45 +4422,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct extent_page_data *epd)
{
u64 disk_bytenr = eb->start;
- u32 nritems;
int i, num_pages;
- unsigned long start, end;
unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
int ret = 0;
- clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
- num_pages = num_extent_pages(eb);
- atomic_set(&eb->io_pages, num_pages);
-
- /* set btree blocks beyond nritems with 0 to avoid stale content. */
- nritems = btrfs_header_nritems(eb);
- if (btrfs_header_level(eb) > 0) {
- end = btrfs_node_key_ptr_offset(nritems);
-
- memzero_extent_buffer(eb, end, eb->len - end);
- } else {
- /*
- * leaf:
- * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
- */
- start = btrfs_item_nr_offset(nritems);
- end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
- memzero_extent_buffer(eb, start, end - start);
- }
-
- if (eb->fs_info->sectorsize < PAGE_SIZE)
- return write_one_subpage_eb(eb, wbc, epd);
+ prepare_eb_write(eb);
+ num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
- p, disk_bytenr, PAGE_SIZE, 0,
- &epd->bio,
+ &epd->bio_ctrl, p, disk_bytenr,
+ PAGE_SIZE, 0,
end_bio_extent_buffer_writepage,
- 0, 0, 0, false);
+ 0, 0, false);
if (ret) {
set_btree_ioerr(p, eb);
if (PageWriteback(p))
@@ -4386,7 +4542,7 @@ static int submit_eb_subpage(struct page *page,
free_extent_buffer(eb);
goto cleanup;
}
- ret = write_one_eb(eb, wbc, epd);
+ ret = write_one_subpage_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
goto cleanup;
@@ -4498,7 +4654,7 @@ int btree_write_cache_pages(struct address_space *mapping,
{
struct extent_buffer *eb_context = NULL;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4780,7 +4936,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
int ret;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4807,7 +4963,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
PAGE_SHIFT;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
};
@@ -4827,8 +4983,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
- btrfs_writepage_endio_finish_ordered(page, start,
- start + PAGE_SIZE - 1, 1);
+ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
+ page, start, start + PAGE_SIZE - 1, 1);
unlock_page(page);
}
put_page(page);
@@ -4850,7 +5006,7 @@ int extent_writepages(struct address_space *mapping,
{
int ret = 0;
struct extent_page_data epd = {
- .bio = NULL,
+ .bio_ctrl = { 0 },
.extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
@@ -4867,8 +5023,7 @@ int extent_writepages(struct address_space *mapping,
void extent_readahead(struct readahead_control *rac)
{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
struct page *pagepool[16];
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
@@ -4879,14 +5034,14 @@ void extent_readahead(struct readahead_control *rac)
u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
contiguous_readpages(pagepool, nr, contig_start, contig_end,
- &em_cached, &bio, &bio_flags, &prev_em_start);
+ &em_cached, &bio_ctrl, &prev_em_start);
}
if (em_cached)
free_extent_map(em_cached);
- if (bio) {
- if (submit_one_bio(bio, 0, bio_flags))
+ if (bio_ctrl.bio) {
+ if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
return;
}
}
@@ -5429,6 +5584,12 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
subpage = (struct btrfs_subpage *)page->private;
if (atomic_read(&subpage->eb_refs))
return true;
+ /*
+ * Even if there are no eb refs here, we may still have
+ * end_page_read() call relying on page::private.
+ */
+ if (atomic_read(&subpage->readers))
+ return true;
}
return false;
}
@@ -5489,7 +5650,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
/*
* We can only detach the page private if there are no other ebs in the
- * page range.
+ * page range and no unfinished IO.
*/
if (!page_range_has_eb(fs_info, page))
btrfs_detach_subpage(fs_info, page);
@@ -6176,7 +6337,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
struct page *page = eb->pages[0];
- struct bio *bio = NULL;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret = 0;
ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
@@ -6184,10 +6345,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
if (wait == WAIT_NONE) {
- ret = try_lock_extent(io_tree, eb->start,
- eb->start + eb->len - 1);
- if (ret <= 0)
- return ret;
+ if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
+ return -EAGAIN;
} else {
ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
if (ret < 0)
@@ -6209,9 +6368,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
check_buffer_tree_ref(eb);
btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
- ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start,
- eb->len, eb->start - page_offset(page), &bio,
- end_bio_extent_readpage, mirror_num, 0, 0,
+ btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
+ ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
+ page, eb->start, eb->len,
+ eb->start - page_offset(page),
+ end_bio_extent_readpage, mirror_num, 0,
true);
if (ret) {
/*
@@ -6221,10 +6382,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
- if (bio) {
+ if (bio_ctrl.bio) {
int tmp;
- tmp = submit_one_bio(bio, mirror_num, 0);
+ tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
+ bio_ctrl.bio = NULL;
if (tmp < 0)
return tmp;
}
@@ -6247,8 +6409,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
int all_uptodate = 1;
int num_pages;
unsigned long num_reads = 0;
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -6312,9 +6473,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
ClearPageError(page);
err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
- page, page_offset(page), PAGE_SIZE, 0,
- &bio, end_bio_extent_readpage,
- mirror_num, 0, 0, false);
+ &bio_ctrl, page, page_offset(page),
+ PAGE_SIZE, 0, end_bio_extent_readpage,
+ mirror_num, 0, false);
if (err) {
/*
* We failed to submit the bio so it's the
@@ -6331,8 +6492,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
}
- if (bio) {
- err = submit_one_bio(bio, mirror_num, bio_flags);
+ if (bio_ctrl.bio) {
+ err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
+ bio_ctrl.bio = NULL;
if (err)
return err;
}
@@ -6515,9 +6677,10 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
char *kaddr;
assert_eb_page_uptodate(eb, eb->pages[0]);
- kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
- memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
- BTRFS_FSID_SIZE);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
+ chunk_tree_uuid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
@@ -6525,9 +6688,9 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
char *kaddr;
assert_eb_page_uptodate(eb, eb->pages[0]);
- kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
- memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
- BTRFS_FSID_SIZE);
+ kaddr = page_address(eb->pages[0]) +
+ get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
+ memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}
void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 227215a5722c..62027f551b44 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,7 +39,7 @@ enum {
/* Page starts writeback, clear dirty bit and set writeback bit */
#define PAGE_START_WRITEBACK (1 << 1)
#define PAGE_END_WRITEBACK (1 << 2)
-#define PAGE_SET_PRIVATE2 (1 << 3)
+#define PAGE_SET_ORDERED (1 << 3)
#define PAGE_SET_ERROR (1 << 4)
#define PAGE_LOCK (1 << 5)
@@ -102,6 +102,17 @@ struct extent_buffer {
};
/*
+ * Structure to record info about the bio being assembled, and other info like
+ * how many bytes are there before stripe/ordered extent boundary.
+ */
+struct btrfs_bio_ctrl {
+ struct bio *bio;
+ unsigned long bio_flags;
+ u32 len_to_stripe_boundary;
+ u32 len_to_oe_boundary;
+};
+
+/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
@@ -169,7 +180,7 @@ int try_release_extent_buffer(struct page *page);
int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags);
int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
- struct bio **bio, unsigned long *bio_flags,
+ struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
@@ -281,7 +292,7 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
+ * mirrors. If another mirror has good data, the sector is set up to date
* and things continue. If a good mirror can't be found, the original
* bio end_io callback is called to indicate things have failed.
*/
@@ -293,15 +304,13 @@ struct io_failure_record {
unsigned long bio_flags;
int this_mirror;
int failed_mirror;
- int in_validation;
};
-
-blk_status_t btrfs_submit_read_repair(struct inode *inode,
- struct bio *failed_bio, u32 bio_offset,
- struct page *page, unsigned int pgoff,
- u64 start, u64 end, int failed_mirror,
- submit_bio_hook_t *submit_bio_hook);
+int btrfs_repair_one_sector(struct inode *inode,
+ struct bio *failed_bio, u32 bio_offset,
+ struct page *page, unsigned int pgoff,
+ u64 start, int failed_mirror,
+ submit_bio_hook_t *submit_bio_hook);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 294602f139ef..df6631eefc65 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -618,7 +618,7 @@ fail:
* @file_start: offset in file this bio begins to describe
* @contig: Boolean. If true/1 means all bio vecs in this bio are
* contiguous and they begin at @file_start in the file. False/0
- * means this bio can contains potentially discontigous bio vecs
+ * means this bio can contain potentially discontiguous bio vecs
* so the logical offset of each should be calculated separately.
*/
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
@@ -788,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
- int ret;
+ int ret = 0;
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
@@ -806,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -862,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
- goto out;
+ break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -906,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ break;
}
+ ret = 0;
key.offset = end_byte - 1;
} else {
@@ -917,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- ret = 0;
-out:
btrfs_free_path(path);
return ret;
}
+static int find_next_csum_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 *next_offset)
+{
+ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
+ struct btrfs_key found_key;
+ int slot = path->slots[0] + 1;
+ int ret;
+
+ if (nritems == 0 || slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *next_offset = (u64)-1;
+ return 0;
+ }
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ *next_offset = (u64)-1;
+ else
+ *next_offset = found_key.offset;
+
+ return 0;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -938,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
u64 total_bytes = 0;
u64 csum_offset;
u64 bytenr;
- u32 nritems;
u32 ins_size;
int index = 0;
int found_next;
@@ -981,26 +1011,10 @@ again:
goto insert;
}
} else {
- int slot = path->slots[0] + 1;
- /* we didn't find a csum item, insert one */
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (!nritems || (path->slots[0] >= nritems - 1)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- found_next = 1;
- goto insert;
- }
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- found_key.type != BTRFS_EXTENT_CSUM_KEY) {
- found_next = 1;
- goto insert;
- }
- next_offset = found_key.offset;
+ /* We didn't find a csum item, insert one. */
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
found_next = 1;
goto insert;
}
@@ -1056,8 +1070,48 @@ extend_csum:
tmp = sums->len - total_bytes;
tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
+ extend_nr = max_t(int, 1, tmp);
+
+ /*
+ * A log tree can already have checksum items with a subset of
+ * the checksums we are trying to log. This can happen after
+ * doing a sequence of partial writes into prealloc extents and
+ * fsyncs in between, with a full fsync logging a larger subrange
+ * of an extent for which a previous fast fsync logged a smaller
+ * subrange. And this happens in particular due to merging file
+ * extent items when we complete an ordered extent for a range
+ * covered by a prealloc extent - this is done at
+ * btrfs_mark_extent_written().
+ *
+ * So if we try to extend the previous checksum item, which has
+ * a range that ends at the start of the range we want to insert,
+ * make sure we don't extend beyond the start offset of the next
+ * checksum item. If we are at the last item in the leaf, then
+ * forget the optimization of extending and add a new checksum
+ * item - it is not worth the complexity of releasing the path,
+ * getting the first key for the next leaf, repeating the btree
+ * search, etc, because log trees are temporary anyway and it
+ * would only save a few bytes of leaf space.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (path->slots[0] + 1 >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+
+ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
+ if (tmp <= INT_MAX)
+ extend_nr = min_t(int, extend_nr, tmp);
+ }
- extend_nr = max_t(int, 1, (int)tmp);
diff = (csum_offset + extend_nr) * csum_size;
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3b10d98b4ebb..28a05ba47060 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
+#include "subpage.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -482,6 +483,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
start_pos = round_down(pos, fs_info->sectorsize);
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
+ ASSERT(num_bytes <= U32_MAX);
end_of_last_block = start_pos + num_bytes - 1;
@@ -500,9 +502,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- SetPageUptodate(p);
+
+ btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
ClearPageChecked(p);
- set_page_dirty(p);
+ btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
/*
@@ -1094,7 +1097,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int del_nr = 0;
int del_slot = 0;
int recow;
- int ret;
+ int ret = 0;
u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
@@ -1315,7 +1318,7 @@ again:
}
out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
/*
@@ -2483,6 +2486,17 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 lockend,
struct extent_state **cached_state)
{
+ /*
+ * For subpage case, if the range is not at page boundary, we could
+ * have pages at the leading/trailing part of the range.
+ * This could lead to dead loop since filemap_range_has_page()
+ * will always return true.
+ * So here we need to do extra page alignment for
+ * filemap_range_has_page().
+ */
+ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
+ const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+
while (1) {
struct btrfs_ordered_extent *ordered;
int ret;
@@ -2503,7 +2517,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
(ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
- lockstart, lockend)) {
+ page_lockstart, page_lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
@@ -3034,22 +3048,20 @@ struct falloc_range {
*/
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
- struct falloc_range *prev = NULL;
struct falloc_range *range = NULL;
- if (list_empty(head))
- goto insert;
-
- /*
- * As fallocate iterate by bytenr order, we only need to check
- * the last range.
- */
- prev = list_entry(head->prev, struct falloc_range, list);
- if (prev->start + prev->len == start) {
- prev->len += len;
- return 0;
+ if (!list_empty(head)) {
+ /*
+ * As fallocate iterates by bytenr order, we only need to check
+ * the last range.
+ */
+ range = list_last_entry(head, struct falloc_range, list);
+ if (range->start + range->len == start) {
+ range->len += len;
+ return 0;
+ }
}
-insert:
+
range = kmalloc(sizeof(*range), GFP_KERNEL);
if (!range)
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4806295116d8..2131ae5b9ed7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -327,7 +327,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
* need to check for -EAGAIN.
*/
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, BTRFS_EXTENT_DATA_KEY);
+ 0, BTRFS_EXTENT_DATA_KEY, NULL);
if (ret)
goto fail;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c6164ae16e2a..e6eb20987351 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -51,6 +51,7 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
+#include "subpage.h"
struct btrfs_iget_args {
u64 ino;
@@ -166,22 +167,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
struct page *page;
while (index <= end_index) {
+ /*
+ * For locked page, we will call end_extent_writepage() on it
+ * in run_delalloc_range() for the error handling. That
+ * end_extent_writepage() function will call
+ * btrfs_mark_ordered_io_finished() to clear page Ordered and
+ * run the ordered extent accounting.
+ *
+ * Here we can't just clear the Ordered bit, or
+ * btrfs_mark_ordered_io_finished() would skip the accounting
+ * for the page range, and the ordered extent will never finish.
+ */
+ if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+ index++;
+ continue;
+ }
page = find_get_page(inode->vfs_inode.i_mapping, index);
index++;
if (!page)
continue;
- ClearPagePrivate2(page);
+
+ /*
+ * Here we just clear all Ordered bits for every page in the
+ * range, then __endio_write_update_ordered() will handle
+ * the ordered extent accounting for the range.
+ */
+ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
+ offset, bytes);
put_page(page);
}
+ /* The locked page covers the full range, nothing needs to be done */
+ if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
+ return;
/*
* In case this page belongs to the delalloc range being instantiated
* then skip it, since the first page of a range is going to be
* properly cleaned up by the caller of run_delalloc_range
*/
if (page_start >= offset && page_end <= (offset + bytes - 1)) {
- offset += PAGE_SIZE;
- bytes -= PAGE_SIZE;
+ bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+ offset = page_offset(locked_page) + PAGE_SIZE;
}
return __endio_write_update_ordered(inode, offset, bytes, false);
@@ -603,7 +629,7 @@ again:
* inode has not been flagged as nocompress. This flag can
* change at any time if we discover bad compression ratios.
*/
- if (inode_need_compress(BTRFS_I(inode), start, end)) {
+ if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
WARN_ON(pages);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
@@ -946,7 +972,8 @@ retry:
const u64 end = start + async_extent->ram_size - 1;
p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(p, start, end, 0);
+ btrfs_writepage_endio_finish_ordered(inode, p, start,
+ end, 0);
p->mapping = NULL;
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1064,7 +1091,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* our outstanding extent for clearing delalloc for this
* range.
*/
- extent_clear_unlock_delalloc(inode, start, end, NULL,
+ extent_clear_unlock_delalloc(inode, start, end,
+ locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
@@ -1072,6 +1100,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
+ /*
+ * locked_page is locked by the caller of
+ * writepage_delalloc(), not locked by
+ * __process_pages_contig().
+ *
+ * We can't let __process_pages_contig() unlock it,
+ * as it doesn't have any subpage::writers recorded.
+ *
+ * Here we manually unlock the page, since the caller
+ * can't use page_started to determine if it's an
+ * inline extent or a compressed extent.
+ */
+ unlock_page(locked_page);
goto out;
} else if (ret < 0) {
goto out_unlock;
@@ -1150,15 +1191,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- /* we're not doing compressed IO, don't unlock the first
- * page (which the caller expects to stay locked), don't
- * clear any dirty bits and don't set any writeback bits
+ /*
+ * We're not doing compressed IO, don't unlock the first page
+ * (which the caller expects to stay locked), don't clear any
+ * dirty bits and don't set any writeback bits
*
- * Do set the Private2 bit so we know this page was properly
- * setup for writepage
+ * Do set the Ordered (Private2) bit so we know this page was
+ * properly setup for writepage.
*/
page_ops = unlock ? PAGE_UNLOCK : 0;
- page_ops |= PAGE_SET_PRIVATE2;
+ page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
locked_page,
@@ -1822,7 +1864,7 @@ out_check:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC |
EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
cur_offset = extent_end;
@@ -2193,26 +2235,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 logical = bio->bi_iter.bi_sector << 9;
+ u32 bio_len = bio->bi_iter.bi_size;
struct extent_map *em;
- u64 length = 0;
- u64 map_length;
int ret = 0;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
- length = bio->bi_iter.bi_size;
- map_length = length;
- em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
- map_length, &geom);
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
if (ret < 0)
goto out;
- if (geom.len < length + size)
+ if (geom.len < bio_len + size)
ret = 1;
out:
free_extent_map(em);
@@ -2233,33 +2271,6 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
}
-bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
- unsigned int size)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered;
- u64 len = bio->bi_iter.bi_size + size;
- bool ret = true;
-
- ASSERT(btrfs_is_zoned(fs_info));
- ASSERT(fs_info->max_zone_append_size > 0);
- ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
-
- /* Ordered extent not yet created, so we're good */
- ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
- if (!ordered)
- return ret;
-
- if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len >
- ordered->disk_bytenr + ordered->disk_num_bytes)
- ret = false;
-
- btrfs_put_ordered_extent(ordered);
-
- return ret;
-}
-
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
struct bio *bio, loff_t file_offset)
{
@@ -2601,7 +2612,7 @@ again:
lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
- if (PagePrivate2(page))
+ if (PageOrdered(page))
goto out_reserved;
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
@@ -2676,8 +2687,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_writepage_fixup *fixup;
- /* this page is properly in the ordered list */
- if (TestClearPagePrivate2(page))
+ /* This page has ordered extent covering it already */
+ if (PageOrdered(page))
return 0;
/*
@@ -2773,7 +2784,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
/*
* If we dropped an inline extent here, we know the range where it is
* was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
- * number of bytes only for that range contaning the inline extent.
+ * number of bytes only for that range containing the inline extent.
* The remaining of the range will be processed when clearning the
* EXTENT_DELALLOC_BIT bit through the ordered extent completion.
*/
@@ -3000,6 +3011,18 @@ out:
if (ret || truncated) {
u64 unwritten_start = start;
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
@@ -3057,28 +3080,14 @@ static void finish_ordered_fn(struct btrfs_work *work)
btrfs_finish_ordered_io(ordered_extent);
}
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
u64 end, int uptodate)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workqueue *wq;
+ trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
- trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
-
- ClearPagePrivate2(page);
- if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
- end - start + 1, uptodate))
- return;
-
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
- btrfs_queue_work(wq, &ordered_extent->work);
+ btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
+ finish_ordered_fn, uptodate);
}
/*
@@ -3140,15 +3149,19 @@ zeroit:
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
+ *
+ * Return a bitmap where bit set means a csum mismatch, and bit not set means
+ * csum match.
*/
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+ struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
+ unsigned int result = 0;
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -3176,10 +3189,14 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
- if (ret < 0)
- return -EIO;
+ if (ret < 0) {
+ const int nr_bit = (pg_off - offset_in_page(start)) >>
+ root->fs_info->sectorsize_bits;
+
+ result |= (1U << nr_bit);
+ }
}
- return 0;
+ return result;
}
/*
@@ -4097,7 +4114,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
* This is a placeholder inode for a subvolume we didn't have a
* reference to at the time of the snapshot creation. In the meantime
* we could have renamed the real subvol link into our snapshot, so
- * depending on btrfs_del_root_ref to return -ENOENT here is incorret.
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
* Instead simply lookup the dir_index_item for this entry so we can
* remove it. Otherwise we know we have a ref to the root and we can
* call btrfs_del_root_ref, and it _shouldn't_ fail.
@@ -4452,20 +4469,36 @@ out:
#define NEED_TRUNCATE_BLOCK 1
/*
- * this can truncate away extent items, csum items and directory items.
- * It starts at a high offset and removes keys until it can't find
- * any higher than new_size
+ * Remove inode items from a given root.
*
- * csum items that cross the new i_size are truncated to the new size
- * as well.
+ * @trans: A transaction handle.
+ * @root: The root from which to remove items.
+ * @inode: The inode whose items we want to remove.
+ * @new_size: The new i_size for the inode. This is only applicable when
+ * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise.
+ * @min_type: The minimum key type to remove. All keys with a type
+ * greater than this value are removed and all keys with
+ * this type are removed only if their offset is >= @new_size.
+ * @extents_found: Output parameter that will contain the number of file
+ * extent items that were removed or adjusted to the new
+ * inode i_size. The caller is responsible for initializing
+ * the counter. Also, it can be NULL if the caller does not
+ * need this counter.
*
- * min_type is the minimum key type to truncate down to. If set to 0, this
- * will kill all the items on this inode, including the INODE_ITEM_KEY.
+ * Remove all keys associated with the inode from the given root that have a key
+ * with a type greater than or equal to @min_type. When @min_type has a value of
+ * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value
+ * greater than or equal to @new_size. If a file extent item that starts before
+ * @new_size and ends after it is found, its length is adjusted.
+ *
+ * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is
+ * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block.
*/
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
- u64 new_size, u32 min_type)
+ u64 new_size, u32 min_type,
+ u64 *extents_found)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
@@ -4611,6 +4644,9 @@ search_again:
if (found_type != BTRFS_EXTENT_DATA_KEY)
goto delete;
+ if (extents_found != NULL)
+ (*extents_found)++;
+
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
@@ -4929,7 +4965,7 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
+ btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -5443,7 +5479,7 @@ void btrfs_evict_inode(struct inode *inode)
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
- 0, 0);
+ 0, 0, NULL);
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -7786,7 +7822,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->bdev = fs_info->fs_devices->latest_bdev;
iomap->length = len;
- if (write && btrfs_use_zone_append(BTRFS_I(inode), em))
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
iomap->flags |= IOMAP_F_ZONE_APPEND;
free_extent_map(em);
@@ -7925,19 +7961,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
btrfs_ino(BTRFS_I(inode)),
pgoff);
} else {
- blk_status_t status;
+ int ret;
ASSERT((start - io_bio->logical) < UINT_MAX);
- status = btrfs_submit_read_repair(inode,
- &io_bio->bio,
- start - io_bio->logical,
- bvec.bv_page, pgoff,
- start,
- start + sectorsize - 1,
- io_bio->mirror_num,
- submit_dio_repair_bio);
- if (status)
- err = status;
+ ret = btrfs_repair_one_sector(inode,
+ &io_bio->bio,
+ start - io_bio->logical,
+ bvec.bv_page, pgoff,
+ start, io_bio->mirror_num,
+ submit_dio_repair_bio);
+ if (ret)
+ err = errno_to_blk_status(ret);
}
start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
@@ -7952,41 +7986,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
const u64 offset, const u64 bytes,
const bool uptodate)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_ordered_extent *ordered = NULL;
- struct btrfs_workqueue *wq;
- u64 ordered_offset = offset;
- u64 ordered_bytes = bytes;
- u64 last_offset;
-
- if (btrfs_is_free_space_inode(inode))
- wq = fs_info->endio_freespace_worker;
- else
- wq = fs_info->endio_write_workers;
-
- while (ordered_offset < offset + bytes) {
- last_offset = ordered_offset;
- if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
- &ordered_offset,
- ordered_bytes,
- uptodate)) {
- btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
- NULL);
- btrfs_queue_work(wq, &ordered->work);
- }
-
- /* No ordered extent found in the range, exit */
- if (ordered_offset == last_offset)
- return;
- /*
- * Our bio might span multiple ordered extents. In this case
- * we keep going until we have accounted the whole dio.
- */
- if (ordered_offset < offset + bytes) {
- ordered_bytes = offset + bytes - ordered_offset;
- ordered = NULL;
- }
- }
+ btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
+ finish_ordered_fn, uptodate);
}
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -8160,7 +8161,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
goto out_err_em;
}
ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
- logical, submit_len, &geom);
+ logical, &geom);
if (ret) {
status = errno_to_blk_status(ret);
goto out_err_em;
@@ -8264,15 +8265,14 @@ int btrfs_readpage(struct file *file, struct page *page)
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
- unsigned long bio_flags = 0;
- struct bio *bio = NULL;
+ struct btrfs_bio_ctrl bio_ctrl = { 0 };
int ret;
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
- ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
- if (bio)
- ret = submit_one_bio(bio, 0, bio_flags);
+ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
+ if (bio_ctrl.bio)
+ ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
return ret;
}
@@ -8341,9 +8341,9 @@ static int btrfs_migratepage(struct address_space *mapping,
if (page_has_private(page))
attach_page_private(newpage, detach_page_private(page));
- if (PagePrivate2(page)) {
- ClearPagePrivate2(page);
- SetPagePrivate2(newpage);
+ if (PageOrdered(page)) {
+ ClearPageOrdered(page);
+ SetPageOrdered(newpage);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -8358,27 +8358,42 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
- struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
- u64 start;
- u64 end;
+ u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
- bool found_ordered = false;
- bool completed_ordered = false;
/*
- * we have the page locked, so new writeback can't start,
- * and the dirty bit won't be cleared while we are here.
+ * We have page locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this page.
*
- * Wait for IO on this page so that we can safely clear
- * the PagePrivate2 bit and do ordered accounting
+ * But already submitted bio can still be finished on this page.
+ * Furthermore, endio function won't skip page which has Ordered
+ * (Private2) already cleared, so it's possible for endio and
+ * invalidatepage to do the same ordered extent accounting twice
+ * on one page.
+ *
+ * So here we wait for any submitted bios to finish, so that we won't
+ * do double ordered extent accounting on the same page.
*/
wait_on_page_writeback(page);
- if (offset) {
+ /*
+ * For subpage case, we have call sites like
+ * btrfs_punch_hole_lock_range() which pass a range not aligned to
+ * sectorsize.
+ * If the range doesn't cover the full page, we don't need to and
+ * shouldn't clear page extent mapped, as page->private can still
+ * record subpage dirty bits for other part of the range.
+ *
+ * For cases that can invalidate the full page even if the range
+ * doesn't cover the full page, like invalidating the last page,
+ * we're still safe to wait for ordered extent to finish.
+ */
+ if (!(offset == 0 && length == PAGE_SIZE)) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
@@ -8386,89 +8401,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
- start = page_start;
-again:
- ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
- if (ordered) {
- found_ordered = true;
- end = min(page_end,
- ordered->file_offset + ordered->num_bytes - 1);
+ cur = page_start;
+ while (cur < page_end) {
+ struct btrfs_ordered_extent *ordered;
+ bool delete_states;
+ u64 range_end;
+ u32 range_len;
+
+ ordered = btrfs_lookup_first_ordered_range(inode, cur,
+ page_end + 1 - cur);
+ if (!ordered) {
+ range_end = page_end;
+ /*
+ * No ordered extent covering this range, we are safe
+ * to delete all extent states in the range.
+ */
+ delete_states = true;
+ goto next;
+ }
+ if (ordered->file_offset > cur) {
+ /*
+ * There is a range between [cur, oe->file_offset) not
+ * covered by any ordered extent.
+ * We are safe to delete all extent states, and handle
+ * the ordered extent in the next iteration.
+ */
+ range_end = ordered->file_offset - 1;
+ delete_states = true;
+ goto next;
+ }
+
+ range_end = min(ordered->file_offset + ordered->num_bytes - 1,
+ page_end);
+ ASSERT(range_end + 1 - cur < U32_MAX);
+ range_len = range_end + 1 - cur;
+ if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ /*
+ * If Ordered (Private2) is cleared, it means endio has
+ * already been executed for the range.
+ * We can't delete the extent states as
+ * btrfs_finish_ordered_io() may still use some of them.
+ */
+ delete_states = false;
+ goto next;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+
/*
* IO on this page will never be started, so we need to account
* for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
* here, must leave that up for the ordered extent completion.
+ *
+ * This will also unlock the range for incoming
+ * btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, start, end,
+ clear_extent_bit(tree, cur, range_end,
EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state);
+
+ spin_lock_irq(&inode->ordered_tree.lock);
+ set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+ ordered->truncated_len = min(ordered->truncated_len,
+ cur - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
+
+ if (btrfs_dec_test_ordered_pending(inode, &ordered,
+ cur, range_end + 1 - cur, 1)) {
+ btrfs_finish_ordered_io(ordered);
+ /*
+ * The ordered extent has finished, now we're again
+ * safe to delete all extent states of the range.
+ */
+ delete_states = true;
+ } else {
+ /*
+ * btrfs_finish_ordered_io() will get executed by endio
+ * of other pages, thus we can't delete extent states
+ * anymore
+ */
+ delete_states = false;
+ }
+next:
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
/*
- * whoever cleared the private bit is responsible
- * for the finish_ordered_io
+ * Qgroup reserved space handler
+ * Sector(s) here will be either:
+ *
+ * 1) Already written to disk or bio already finished
+ * Then its QGROUP_RESERVED bit in io_tree is already cleared.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the
+ * QGROUP_RESERVED bit of its io_tree, and free the qgroup
+ * reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (TestClearPagePrivate2(page)) {
- spin_lock_irq(&inode->ordered_tree.lock);
- set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- ordered->truncated_len = min(ordered->truncated_len,
- start - ordered->file_offset);
- spin_unlock_irq(&inode->ordered_tree.lock);
-
- if (btrfs_dec_test_ordered_pending(inode, &ordered,
- start,
- end - start + 1, 1)) {
- btrfs_finish_ordered_io(ordered);
- completed_ordered = true;
- }
- }
- btrfs_put_ordered_extent(ordered);
+ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
if (!inode_evicting) {
- cached_state = NULL;
- lock_extent_bits(tree, start, end,
- &cached_state);
+ clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
+ delete_states, &cached_state);
}
-
- start = end + 1;
- if (start < page_end)
- goto again;
+ cur = range_end + 1;
}
-
/*
- * Qgroup reserved space handler
- * Page here will be either
- * 1) Already written to disk or ordered extent already submitted
- * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
- * Qgroup will be handled by its qgroup_record then.
- * btrfs_qgroup_free_data() call will do nothing here.
- *
- * 2) Not written to disk yet
- * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
- * bit of its io_tree, and free the qgroup reserved data space.
- * Since the IO will never happen for this page.
+ * We have iterated through all ordered extents of the page, the page
+ * should not have Ordered (Private2) anymore, or the above iteration
+ * did something wrong.
*/
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
- if (!inode_evicting) {
- bool delete = true;
-
- /*
- * If there's an ordered extent for this range and we have not
- * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set
- * in the range for the ordered extent completion. We must also
- * not delete the range, otherwise we would lose that bit (and
- * any other bits set in the range). Make sure EXTENT_UPTODATE
- * is cleared if we don't delete, otherwise it can lead to
- * corruptions if the i_size is extented later.
- */
- if (found_ordered && !completed_ordered)
- delete = false;
- clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
- delete, &cached_state);
-
+ ASSERT(!PageOrdered(page));
+ if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- }
-
ClearPageChecked(page);
clear_page_extent_mapped(page);
}
@@ -8614,8 +8663,8 @@ again:
flush_dcache_page(page);
}
ClearPageChecked(page);
- set_page_dirty(page);
- SetPageUptodate(page);
+ btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8649,6 +8698,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
+ u64 extents_found = 0;
if (!skip_writeback) {
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
@@ -8706,20 +8756,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
min_size, false);
BUG_ON(ret);
- /*
- * So if we truncate and then write and fsync we normally would just
- * write the extents that changed, which is a problem if we need to
- * first truncate that entire inode. So set this flag so we write out
- * all of the extents in the inode to the sync log so we're completely
- * safe.
- */
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
trans->block_rsv = rsv;
while (1) {
ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
inode->i_size,
- BTRFS_EXTENT_DATA_KEY);
+ BTRFS_EXTENT_DATA_KEY,
+ &extents_found);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
break;
@@ -8781,6 +8824,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
}
out:
btrfs_free_block_rsv(fs_info, rsv);
+ /*
+ * So if we truncate and then write and fsync we normally would just
+ * write the extents that changed, which is a problem if we need to
+ * first truncate that entire inode. So set this flag so we write out
+ * all of the extents in the inode to the sync log so we're completely
+ * safe.
+ *
+ * If no extents were dropped or trimmed we don't need to force the next
+ * fsync to truncate all the inode's items from the log and re-log them
+ * all. This means the truncate operation did not change the file size,
+ * or changed it to a smaller size but there was only an implicit hole
+ * between the old i_size and the new i_size, and there were no prealloc
+ * extents beyond i_size to drop.
+ */
+ if (extents_found > 0)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
return ret;
}
@@ -9076,6 +9135,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ bool need_abort = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
@@ -9135,6 +9195,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_idx);
if (ret)
goto out_fail;
+ need_abort = true;
}
/* And now for the dest. */
@@ -9150,8 +9211,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
- if (ret)
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
+ }
}
/* Update inode version and ctime/mtime. */
@@ -10182,17 +10246,21 @@ out:
return ret;
}
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
- struct inode *inode = tree->private_data;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
+ u32 len;
+ ASSERT(end + 1 - start <= U32_MAX);
+ len = end + 1 - start;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
+ page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- set_page_writeback(page);
+
+ btrfs_page_set_writeback(fs_info, page, start, len);
put_page(page);
index++;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5dc2fd843ae3..0ba98e08a029 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -353,15 +353,55 @@ update_flags:
return ret;
}
+/*
+ * Start exclusive operation @type, return true on success
+ */
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
- return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+ bool ret = false;
+
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
+ fs_info->exclusive_operation = type;
+ ret = true;
+ }
+ spin_unlock(&fs_info->super_lock);
+
+ return ret;
+}
+
+/*
+ * Conditionally allow to enter the exclusive operation in case it's compatible
+ * with the running one. This must be paired with btrfs_exclop_start_unlock and
+ * btrfs_exclop_finish.
+ *
+ * Compatibility:
+ * - the same type is already running
+ * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
+ * must check the condition first that would allow none -> @type
+ */
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ spin_lock(&fs_info->super_lock);
+ if (fs_info->exclusive_operation == type)
+ return true;
+
+ spin_unlock(&fs_info->super_lock);
+ return false;
+}
+
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
+{
+ spin_unlock(&fs_info->super_lock);
}
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
{
+ spin_lock(&fs_info->super_lock);
WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ spin_unlock(&fs_info->super_lock);
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
@@ -1455,7 +1495,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
if (btrfs_defrag_cancelled(fs_info)) {
btrfs_debug(fs_info, "defrag_file cancelled");
ret = -EAGAIN;
- break;
+ goto error;
}
if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
@@ -1533,6 +1573,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
}
+ ret = defrag_count;
+error:
if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
@@ -1546,8 +1588,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
}
- ret = defrag_count;
-
out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
@@ -1560,6 +1600,48 @@ out_ra:
return ret;
}
+/*
+ * Try to start exclusive operation @type or cancel it if it's running.
+ *
+ * Return:
+ * 0 - normal mode, newly claimed op started
+ * >0 - normal mode, something else is running,
+ * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
+ * -ECANCELED - cancel mode, successful cancel
+ * -ENOTCONN - cancel mode, operation not running anymore
+ */
+static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type, bool cancel)
+{
+ if (!cancel) {
+ /* Start normal op */
+ if (!btrfs_exclop_start(fs_info, type))
+ return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ /* Exclusive operation is now claimed */
+ return 0;
+ }
+
+ /* Cancel running op */
+ if (btrfs_exclop_start_try_lock(fs_info, type)) {
+ /*
+ * This blocks any exclop finish from setting it to NONE, so we
+ * request cancellation. Either it runs and we will wait for it,
+ * or it has finished and no waiting will happen.
+ */
+ atomic_inc(&fs_info->reloc_cancel_req);
+ btrfs_exclop_start_unlock(fs_info);
+
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
+ wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
+ TASK_INTERRUPTIBLE);
+
+ return -ECANCELED;
+ }
+
+ /* Something else is running or none */
+ return -ENOTCONN;
+}
+
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
@@ -1577,6 +1659,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
char *devstr = NULL;
int ret = 0;
int mod = 0;
+ bool cancel;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1585,20 +1668,23 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
- mnt_drop_write_file(file);
- return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- }
-
+ /*
+ * Read the arguments before checking exclusivity to be able to
+ * distinguish regular resize and cancel
+ */
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto out;
+ goto out_drop;
}
-
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-
sizestr = vol_args->name;
+ cancel = (strcmp("cancel", sizestr) == 0);
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
+ if (ret)
+ goto out_free;
+ /* Exclusive operation is now claimed */
+
devstr = strchr(sizestr, ':');
if (devstr) {
sizestr = devstr + 1;
@@ -1606,10 +1692,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
devstr = vol_args->name;
ret = kstrtoull(devstr, 10, &devid);
if (ret)
- goto out_free;
+ goto out_finish;
if (!devid) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
btrfs_info(fs_info, "resizing devid %llu", devid);
}
@@ -1619,7 +1705,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
ret = -ENODEV;
- goto out_free;
+ goto out_finish;
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1627,7 +1713,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
"resizer unable to apply on readonly device %llu",
devid);
ret = -EPERM;
- goto out_free;
+ goto out_finish;
}
if (!strcmp(sizestr, "max"))
@@ -1643,13 +1729,13 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = memparse(sizestr, &retptr);
if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
}
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -EPERM;
- goto out_free;
+ goto out_finish;
}
old_size = btrfs_device_get_total_bytes(device);
@@ -1657,24 +1743,24 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
new_size = old_size - new_size;
} else if (mod > 0) {
if (new_size > ULLONG_MAX - old_size) {
ret = -ERANGE;
- goto out_free;
+ goto out_finish;
}
new_size = old_size + new_size;
}
if (new_size < SZ_256M) {
ret = -EINVAL;
- goto out_free;
+ goto out_finish;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
- goto out_free;
+ goto out_finish;
}
new_size = round_down(new_size, fs_info->sectorsize);
@@ -1683,7 +1769,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_free;
+ goto out_finish;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans);
@@ -1696,10 +1782,11 @@ static noinline int btrfs_ioctl_resize(struct file *file,
"resize device %s (devid %llu) from %llu to %llu",
rcu_str_deref(device->name), device->devid,
old_size, new_size);
+out_finish:
+ btrfs_exclop_finish(fs_info);
out_free:
kfree(vol_args);
-out:
- btrfs_exclop_finish(fs_info);
+out_drop:
mnt_drop_write_file(file);
return ret;
}
@@ -2897,7 +2984,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = PTR_ERR(subvol_name_ptr);
goto free_parent;
}
- /* subvol_name_ptr is already NULL termined */
+ /* subvol_name_ptr is already nul terminated */
subvol_name = (char *)kbasename(subvol_name_ptr);
}
} else {
@@ -3119,6 +3206,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
+ bool cancel = false;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3137,18 +3225,22 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
ret = -EOPNOTSUPP;
goto out;
}
+ vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
+ strcmp("cancel", vol_args->name) == 0)
+ cancel = true;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret)
goto out;
- }
+ /* Exclusive operation is now claimed */
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
- } else {
- vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+ else
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
- }
+
btrfs_exclop_finish(fs_info);
if (!ret) {
@@ -3172,6 +3264,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
int ret;
+ bool cancel;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3180,25 +3273,24 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
- ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
- goto out_drop_write;
- }
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto out;
+ goto out_drop_write;
}
-
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ cancel = (strcmp("cancel", vol_args->name) == 0);
+
+ ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
+ cancel);
+ if (ret == 0) {
+ ret = btrfs_rm_device(fs_info, vol_args->name, 0);
+ if (!ret)
+ btrfs_info(fs_info, "disk deleted %s", vol_args->name);
+ btrfs_exclop_finish(fs_info);
+ }
- if (!ret)
- btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
-out:
- btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3551,7 +3643,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
goto out;
}
transid = trans->transid;
- ret = btrfs_commit_transaction_async(trans, 0);
+ ret = btrfs_commit_transaction_async(trans);
if (ret) {
btrfs_end_transaction(trans);
return ret;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 5fafc5e89bb7..313d9d685adb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,7 +57,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
/*
* Try-lock for read.
*
- * Retrun 1 if the rwlock has been taken, 0 otherwise
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
@@ -72,7 +72,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
/*
* Try-lock for write.
*
- * Retrun 1 if the rwlock has been taken, 0 otherwise
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6c413bb451a3..6eb41b7c0c84 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,6 +16,7 @@
#include "compression.h"
#include "delalloc-space.h"
#include "qgroup.h"
+#include "subpage.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -300,81 +301,142 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
}
/*
- * Finish IO for one ordered extent across a given range. The range can
- * contain several ordered extents.
+ * Mark all ordered extents io inside the specified range finished.
*
- * @found_ret: Return the finished ordered extent
- * @file_offset: File offset for the finished IO
- * Will also be updated to one byte past the range that is
- * recordered as finished. This allows caller to walk forward.
- * @io_size: Length of the finish IO range
- * @uptodate: If the IO finished without problem
- *
- * Return true if any ordered extent is finished in the range, and update
- * @found_ret and @file_offset.
- * Return false otherwise.
+ * @page: The involved page for the operation.
+ * For uncompressed buffered IO, the page status also needs to be
+ * updated to indicate whether the pending ordered io is finished.
+ * Can be NULL for direct IO and compressed write.
+ * For these cases, callers are ensured they won't execute the
+ * endio function twice.
+ * @finish_func: The function to be executed when all the IO of an ordered
+ * extent are finished.
*
- * NOTE: Although The range can cross multiple ordered extents, only one
- * ordered extent will be updated during one call. The caller is responsible to
- * iterate all ordered extents in the range.
+ * This function is called for endio, thus the range must have ordered
+ * extent(s) covering it.
*/
-bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **finished_ret,
- u64 *file_offset, u64 io_size, int uptodate)
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, btrfs_func_t finish_func,
+ bool uptodate)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_workqueue *wq;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- bool finished = false;
unsigned long flags;
- u64 dec_end;
- u64 dec_start;
- u64 to_dec;
+ u64 cur = file_offset;
+
+ if (btrfs_is_free_space_inode(inode))
+ wq = fs_info->endio_freespace_worker;
+ else
+ wq = fs_info->endio_write_workers;
+
+ if (page)
+ ASSERT(page->mapping && page_offset(page) <= file_offset &&
+ file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
spin_lock_irqsave(&tree->lock, flags);
- node = tree_search(tree, *file_offset);
- if (!node)
- goto out;
+ while (cur < file_offset + num_bytes) {
+ u64 entry_end;
+ u64 end;
+ u32 len;
- entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!in_range(*file_offset, entry->file_offset, entry->num_bytes))
- goto out;
+ node = tree_search(tree, cur);
+ /* No ordered extents at all */
+ if (!node)
+ break;
- dec_start = max(*file_offset, entry->file_offset);
- dec_end = min(*file_offset + io_size,
- entry->file_offset + entry->num_bytes);
- *file_offset = dec_end;
- if (dec_start > dec_end) {
- btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
- dec_start, dec_end);
- }
- to_dec = dec_end - dec_start;
- if (to_dec > entry->bytes_left) {
- btrfs_crit(fs_info,
- "bad ordered accounting left %llu size %llu",
- entry->bytes_left, to_dec);
- }
- entry->bytes_left -= to_dec;
- if (!uptodate)
- set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ entry_end = entry->file_offset + entry->num_bytes;
+ /*
+ * |<-- OE --->| |
+ * cur
+ * Go to next OE.
+ */
+ if (cur >= entry_end) {
+ node = rb_next(node);
+ /* No more ordered extents, exit */
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_ordered_extent,
+ rb_node);
+
+ /* Go to next ordered extent and continue */
+ cur = entry->file_offset;
+ continue;
+ }
+ /*
+ * | |<--- OE --->|
+ * cur
+ * Go to the start of OE.
+ */
+ if (cur < entry->file_offset) {
+ cur = entry->file_offset;
+ continue;
+ }
- if (entry->bytes_left == 0) {
/*
- * Ensure only one caller can set the flag and finished_ret
- * accordingly
+ * Now we are definitely inside one ordered extent.
+ *
+ * |<--- OE --->|
+ * |
+ * cur
*/
- finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- /* test_and_set_bit implies a barrier */
- cond_wake_up_nomb(&entry->wait);
- }
-out:
- if (finished && finished_ret && entry) {
- *finished_ret = entry;
- refcount_inc(&entry->refs);
+ end = min(entry->file_offset + entry->num_bytes,
+ file_offset + num_bytes) - 1;
+ ASSERT(end + 1 - cur < U32_MAX);
+ len = end + 1 - cur;
+
+ if (page) {
+ /*
+ * Ordered (Private2) bit indicates whether we still
+ * have pending io unfinished for the ordered extent.
+ *
+ * If there's no such bit, we need to skip to next range.
+ */
+ if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
+ cur += len;
+ continue;
+ }
+ btrfs_page_clear_ordered(fs_info, page, cur, len);
+ }
+
+ /* Now we're fine to update the accounting */
+ if (unlikely(len > entry->bytes_left)) {
+ WARN_ON(1);
+ btrfs_crit(fs_info,
+"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode),
+ entry->file_offset,
+ entry->num_bytes,
+ len, entry->bytes_left);
+ entry->bytes_left = 0;
+ } else {
+ entry->bytes_left -= len;
+ }
+
+ if (!uptodate)
+ set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
+ /*
+ * All the IO of the ordered extent is finished, we need to queue
+ * the finish_func to be executed.
+ */
+ if (entry->bytes_left == 0) {
+ set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ cond_wake_up(&entry->wait);
+ refcount_inc(&entry->refs);
+ spin_unlock_irqrestore(&tree->lock, flags);
+ btrfs_init_work(&entry->work, finish_func, NULL, NULL);
+ btrfs_queue_work(wq, &entry->work);
+ spin_lock_irqsave(&tree->lock, flags);
+ }
+ cur += len;
}
spin_unlock_irqrestore(&tree->lock, flags);
- return finished;
}
/*
@@ -870,6 +932,81 @@ out:
}
/*
+ * Lookup the first ordered extent that overlaps the range
+ * [@file_offset, @file_offset + @len).
+ *
+ * The difference between this and btrfs_lookup_first_ordered_extent() is
+ * that this one won't return any ordered extent that does not overlap the range.
+ * And the difference against btrfs_lookup_ordered_extent() is, this function
+ * ensures the first ordered extent gets returned.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *node;
+ struct rb_node *cur;
+ struct rb_node *prev;
+ struct rb_node *next;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ spin_lock_irq(&tree->lock);
+ node = tree->tree.rb_node;
+ /*
+ * Here we don't want to use tree_search() which will use tree->last
+ * and screw up the search order.
+ * And __tree_search() can't return the adjacent ordered extents
+ * either, thus here we do our own search.
+ */
+ while (node) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+ if (file_offset < entry->file_offset) {
+ node = node->rb_left;
+ } else if (file_offset >= entry_end(entry)) {
+ node = node->rb_right;
+ } else {
+ /*
+ * Direct hit, got an ordered extent that covers
+ * @file_offset
+ */
+ goto out;
+ }
+ }
+ if (!entry) {
+ /* Empty tree */
+ goto out;
+ }
+
+ cur = &entry->rb_node;
+ /* We got an entry around @file_offset, check adjacent entries */
+ if (entry->file_offset < file_offset) {
+ prev = cur;
+ next = rb_next(cur);
+ } else {
+ prev = rb_prev(cur);
+ next = cur;
+ }
+ if (prev) {
+ entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
+ }
+ if (next) {
+ entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ goto out;
+ }
+ /* No ordered extent in the range */
+ entry = NULL;
+out:
+ if (entry)
+ refcount_inc(&entry->refs);
+ spin_unlock_irq(&tree->lock);
+ return entry;
+}
+
+/*
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e60c07f36427..566472004edd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -172,13 +172,13 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
+void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
+ struct page *page, u64 file_offset,
+ u64 num_bytes, btrfs_func_t finish_func,
+ bool uptodate);
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
-bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
- struct btrfs_ordered_extent **finished_ret,
- u64 *file_offset, u64 io_size,
- int uptodate);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
int type);
@@ -196,6 +196,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
+ struct btrfs_inode *inode, u64 file_offset, u64 len);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 2dcb1cb21634..b1cb5a8c2999 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -260,6 +260,10 @@ static int prop_compression_validate(const char *value, size_t len)
if (btrfs_compress_is_valid_type(value, len))
return 0;
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0))
+ return 0;
+
return -EINVAL;
}
@@ -269,7 +273,17 @@ static int prop_compression_apply(struct inode *inode, const char *value,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int type;
+ /* Reset to defaults */
if (len == 0) {
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ return 0;
+ }
+
+ /* Set NOCOMPRESS flag */
+ if ((len == 2 && strncmp("no", value, 2) == 0) ||
+ (len == 4 && strncmp("none", value, 4) == 0)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
@@ -348,7 +362,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
/*
* This is not strictly necessary as the property should be
- * valid, but in case it isn't, don't propagate it futher.
+ * valid, but in case it isn't, don't propagate it further.
*/
ret = h->validate(value, strlen(value));
if (ret)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3ded812f522c..07ec06d4e972 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2521,7 +2521,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
int ret = 0;
/*
- * If quotas get disabled meanwhile, the resouces need to be freed and
+ * If quotas get disabled meanwhile, the resources need to be freed and
* we can't just exit here.
*/
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
@@ -3545,13 +3545,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- /*
- * Can't hold an open transaction or we run the risk of deadlocking,
- * and can't either be under the context of a send operation (where
- * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that
- * would result in a crash when starting a transaction and does not
- * make sense either (send is a read-only operation).
- */
+ /* Can't hold an open transaction or we run the risk of deadlocking. */
ASSERT(current->journal_info == NULL);
if (WARN_ON(current->journal_info))
return 0;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 3928ecc40d7b..9b0814318e72 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -7,6 +7,7 @@
#include "delalloc-space.h"
#include "reflink.h"
#include "transaction.h"
+#include "subpage.h"
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
@@ -52,7 +53,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 datal,
const u8 comp_type)
{
- const u64 block_size = btrfs_inode_sectorsize(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 block_size = fs_info->sectorsize;
const u64 range_end = file_offset + block_size - 1;
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
@@ -106,10 +108,12 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
if (comp_type == BTRFS_COMPRESS_NONE) {
- memcpy_to_page(page, 0, data_start, datal);
+ memcpy_to_page(page, offset_in_page(file_offset), data_start,
+ datal);
flush_dcache_page(page);
} else {
- ret = btrfs_decompress(comp_type, data_start, page, 0,
+ ret = btrfs_decompress(comp_type, data_start, page,
+ offset_in_page(file_offset),
inline_size, datal);
if (ret)
goto out_unlock;
@@ -133,9 +137,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
flush_dcache_page(page);
}
- SetPageUptodate(page);
+ btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
ClearPageChecked(page);
- set_page_dirty(page);
+ btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
if (page) {
unlock_page(page);
@@ -203,10 +207,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal,
- comp_type);
- goto out;
+ goto copy_to_page;
}
} else if (i_size_read(dst) <= datal) {
struct btrfs_file_extent_item *ei;
@@ -222,13 +223,10 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
copy_inline_extent:
- ret = 0;
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
@@ -240,11 +238,13 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
btrfs_release_path(path);
/*
* If we end up here it means were copy the inline extent into a leaf
@@ -301,6 +301,21 @@ out:
*trans_out = trans;
return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
}
/**
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b70be2ac2e9e..fc831597cb22 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2876,11 +2876,12 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
}
/*
- * Allow error injection to test balance cancellation
+ * Allow error injection to test balance/relocation cancellation
*/
noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
return atomic_read(&fs_info->balance_cancel_req) ||
+ atomic_read(&fs_info->reloc_cancel_req) ||
fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
@@ -3780,6 +3781,60 @@ out:
return inode;
}
+/*
+ * Mark start of chunk relocation that is cancellable. Check if the cancellation
+ * has been requested meanwhile and don't start in that case.
+ *
+ * Return:
+ * 0 success
+ * -EINPROGRESS operation is already in progress, that's probably a bug
+ * -ECANCELED cancellation request was set before the operation started
+ * -EAGAIN can not start because there are ongoing send operations
+ */
+static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->send_reloc_lock);
+ if (fs_info->send_in_progress) {
+ btrfs_warn_rl(fs_info,
+"cannot run relocation while send operations are in progress (%d in progress)",
+ fs_info->send_in_progress);
+ spin_unlock(&fs_info->send_reloc_lock);
+ return -EAGAIN;
+ }
+ if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+ /* This should not happen */
+ spin_unlock(&fs_info->send_reloc_lock);
+ btrfs_err(fs_info, "reloc already running, cannot start");
+ return -EINPROGRESS;
+ }
+ spin_unlock(&fs_info->send_reloc_lock);
+
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
+ btrfs_info(fs_info, "chunk relocation canceled on start");
+ /*
+ * On cancel, clear all requests but let the caller mark
+ * the end after cleanup operations.
+ */
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+ return -ECANCELED;
+ }
+ return 0;
+}
+
+/*
+ * Mark end of chunk relocation that is cancellable and wake any waiters.
+ */
+static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
+{
+ /* Requested after start, clear bit first so any waiters can continue */
+ if (atomic_read(&fs_info->reloc_cancel_req) > 0)
+ btrfs_info(fs_info, "chunk relocation canceled during operation");
+ spin_lock(&fs_info->send_reloc_lock);
+ clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
+ spin_unlock(&fs_info->send_reloc_lock);
+ atomic_set(&fs_info->reloc_cancel_req, 0);
+}
+
static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
struct reloc_control *rc;
@@ -3862,6 +3917,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
return -ENOMEM;
}
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_put_bg;
+ }
+
rc->extent_root = extent_root;
rc->block_group = bg;
@@ -3952,7 +4013,9 @@ out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
- btrfs_put_block_group(rc->block_group);
+out_put_bg:
+ btrfs_put_block_group(bg);
+ reloc_chunk_end(fs_info);
free_reloc_control(rc);
return err;
}
@@ -4073,6 +4136,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
goto out;
}
+ ret = reloc_chunk_start(fs_info);
+ if (ret < 0) {
+ err = ret;
+ goto out_end;
+ }
+
rc->extent_root = fs_info->extent_root;
set_reloc_control(rc);
@@ -4137,6 +4206,8 @@ out_clean:
err = ret;
out_unset:
unset_reloc_control(rc);
+out_end:
+ reloc_chunk_end(fs_info);
free_reloc_control(rc);
out:
free_reloc_roots(&reloc_roots);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 485cda3eb8d7..088641ba7a8e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -165,6 +165,10 @@ struct scrub_ctx {
int readonly;
int pages_per_rd_bio;
+ /* State of IO submission throttling affecting the associated device */
+ ktime_t throttle_deadline;
+ u64 throttle_sent;
+
int is_dev_replace;
u64 write_pointer;
@@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
spin_lock_init(&sctx->list_lock);
spin_lock_init(&sctx->stat_lock);
init_waitqueue_head(&sctx->list_wait);
+ sctx->throttle_deadline = 0;
WARN_ON(sctx->wr_curr_bio != NULL);
mutex_init(&sctx->wr_lock);
@@ -626,7 +631,6 @@ nomem:
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
void *warn_ctx)
{
- u64 isize;
u32 nlink;
int ret;
int i;
@@ -662,7 +666,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
eb = swarn->path->nodes[0];
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
struct btrfs_inode_item);
- isize = btrfs_inode_size(eb, inode_item);
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
@@ -691,12 +694,12 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
+"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
swarn->physical,
root, inum, offset,
- min(isize - offset, (u64)PAGE_SIZE), nlink,
+ fs_info->sectorsize, nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
btrfs_put_root(local_root);
@@ -885,25 +888,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* read all mirrors one after the other. This includes to
* re-read the extent or metadata block that failed (that was
* the cause that this fixup code is called) another time,
- * page by page this time in order to know which pages
+ * sector by sector this time in order to know which sectors
* caused I/O errors and which ones are good (for all mirrors).
* It is the goal to handle the situation when more than one
* mirror contains I/O errors, but the errors do not
* overlap, i.e. the data can be repaired by selecting the
- * pages from those mirrors without I/O error on the
- * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
- * would be that mirror #1 has an I/O error on the first page,
- * the second page is good, and mirror #2 has an I/O error on
- * the second page, but the first page is good.
- * Then the first page of the first mirror can be repaired by
- * taking the first page of the second mirror, and the
- * second page of the second mirror can be repaired by
- * copying the contents of the 2nd page of the 1st mirror.
- * One more note: if the pages of one mirror contain I/O
+ * sectors from those mirrors without I/O error on the
+ * particular sectors. One example (with blocks >= 2 * sectorsize)
+ * would be that mirror #1 has an I/O error on the first sector,
+ * the second sector is good, and mirror #2 has an I/O error on
+ * the second sector, but the first sector is good.
+ * Then the first sector of the first mirror can be repaired by
+ * taking the first sector of the second mirror, and the
+ * second sector of the second mirror can be repaired by
+ * copying the contents of the 2nd sector of the 1st mirror.
+ * One more note: if the sectors of one mirror contain I/O
* errors, the checksum cannot be verified. In order to get
* the best data for repairing, the first attempt is to find
* a mirror without I/O errors and with a validated checksum.
- * Only if this is not possible, the pages are picked from
+ * Only if this is not possible, the sectors are picked from
* mirrors with I/O errors without considering the checksum.
* If the latter is the case, at the end, the checksum of the
* repaired area is verified in order to correctly maintain
@@ -1060,26 +1063,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
/*
* In case of I/O errors in the area that is supposed to be
- * repaired, continue by picking good copies of those pages.
- * Select the good pages from mirrors to rewrite bad pages from
+ * repaired, continue by picking good copies of those sectors.
+ * Select the good sectors from mirrors to rewrite bad sectors from
* the area to fix. Afterwards verify the checksum of the block
* that is supposed to be repaired. This verification step is
* only done for the purpose of statistic counting and for the
* final scrub report, whether errors remain.
* A perfect algorithm could make use of the checksum and try
- * all possible combinations of pages from the different mirrors
+ * all possible combinations of sectors from the different mirrors
* until the checksum verification succeeds. For example, when
- * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+ * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
* of mirror #2 is readable but the final checksum test fails,
- * then the 2nd page of mirror #3 could be tried, whether now
+ * then the 2nd sector of mirror #3 could be tried, whether now
* the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
* without I/O error based on sector sizes (512 bytes on legacy
- * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+ * disks) instead of on sectorsize. Then maybe 512 byte of one
* mirror could be repaired by taking 512 byte of a different
- * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+ * mirror, even if other 512 byte sectors in the same sectorsize
* area are unreadable.
*/
success = 1;
@@ -1260,7 +1263,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- u64 length = original_sblock->page_count * PAGE_SIZE;
+ u64 length = original_sblock->page_count * fs_info->sectorsize;
u64 logical = original_sblock->pagev[0]->logical;
u64 generation = original_sblock->pagev[0]->generation;
u64 flags = original_sblock->pagev[0]->flags;
@@ -1283,13 +1286,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
*/
while (length > 0) {
- sublen = min_t(u64, length, PAGE_SIZE);
+ sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
bbio = NULL;
/*
- * with a length of PAGE_SIZE, each returned stripe
- * represents one mirror
+ * With a length of sectorsize, each returned stripe represents
+ * one mirror
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
@@ -1480,7 +1483,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio = btrfs_io_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
- bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+ bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
bio->bi_iter.bi_sector = spage->physical >> 9;
bio->bi_opf = REQ_OP_READ;
@@ -1544,6 +1547,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
struct scrub_page *spage_good = sblock_good->pagev[page_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
+ const u32 sectorsize = fs_info->sectorsize;
BUG_ON(spage_bad->page == NULL);
BUG_ON(spage_good->page == NULL);
@@ -1563,8 +1567,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
- ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
- if (PAGE_SIZE != ret) {
+ ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
+ if (ret != sectorsize) {
bio_put(bio);
return -EIO;
}
@@ -1642,6 +1646,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
{
struct scrub_bio *sbio;
int ret;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
mutex_lock(&sctx->wr_lock);
again:
@@ -1681,16 +1686,16 @@ again:
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ } else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical_for_dev_replace ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
+ sbio->logical + sbio->page_count * sectorsize !=
spage->logical) {
scrub_wr_submit(sctx);
goto again;
}
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
+ ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
@@ -1729,7 +1734,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx)
btrfsic_submit_bio(sbio->bio);
if (btrfs_is_zoned(sctx->fs_info))
- sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
+ sctx->write_pointer = sbio->physical + sbio->page_count *
+ sctx->fs_info->sectorsize;
}
static void scrub_wr_bio_end_io(struct bio *bio)
@@ -1988,6 +1994,65 @@ static void scrub_page_put(struct scrub_page *spage)
}
}
+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/btrfs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
+{
+ const int time_slice = 1000;
+ struct scrub_bio *sbio;
+ struct btrfs_device *device;
+ s64 delta;
+ ktime_t now;
+ u32 div;
+ u64 bwlimit;
+
+ sbio = sctx->bios[sctx->curr];
+ device = sbio->dev;
+ bwlimit = READ_ONCE(device->scrub_speed_max);
+ if (bwlimit == 0)
+ return;
+
+ /*
+ * Slice is divided into intervals when the IO is submitted, adjust by
+ * bwlimit and maximum of 64 intervals.
+ */
+ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+ div = min_t(u32, 64, div);
+
+ /* Start new epoch, set deadline */
+ now = ktime_get();
+ if (sctx->throttle_deadline == 0) {
+ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+ sctx->throttle_sent = 0;
+ }
+
+ /* Still in the time to send? */
+ if (ktime_before(now, sctx->throttle_deadline)) {
+ /* If current bio is within the limit, send it */
+ sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+ if (sctx->throttle_sent <= div_u64(bwlimit, div))
+ return;
+
+ /* We're over the limit, sleep until the rest of the slice */
+ delta = ktime_ms_delta(sctx->throttle_deadline, now);
+ } else {
+ /* New request after deadline, start new epoch */
+ delta = 0;
+ }
+
+ if (delta) {
+ long timeout;
+
+ timeout = div_u64(delta * HZ, 1000);
+ schedule_timeout_interruptible(timeout);
+ }
+
+ /* Next call will start the deadline period */
+ sctx->throttle_deadline = 0;
+}
+
static void scrub_submit(struct scrub_ctx *sctx)
{
struct scrub_bio *sbio;
@@ -1995,6 +2060,8 @@ static void scrub_submit(struct scrub_ctx *sctx)
if (sctx->curr == -1)
return;
+ scrub_throttle(sctx);
+
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
@@ -2006,6 +2073,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
{
struct scrub_block *sblock = spage->sblock;
struct scrub_bio *sbio;
+ const u32 sectorsize = sctx->fs_info->sectorsize;
int ret;
again:
@@ -2044,9 +2112,9 @@ again:
bio->bi_iter.bi_sector = sbio->physical >> 9;
bio->bi_opf = REQ_OP_READ;
sbio->status = 0;
- } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+ } else if (sbio->physical + sbio->page_count * sectorsize !=
spage->physical ||
- sbio->logical + sbio->page_count * PAGE_SIZE !=
+ sbio->logical + sbio->page_count * sectorsize !=
spage->logical ||
sbio->dev != spage->dev) {
scrub_submit(sctx);
@@ -2054,8 +2122,8 @@ again:
}
sbio->pagev[sbio->page_count] = spage;
- ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
+ ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+ if (ret != sectorsize) {
if (sbio->page_count < 1) {
bio_put(sbio->bio);
sbio->bio = NULL;
@@ -2398,7 +2466,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
if (sblock->sparity && corrupted && !sblock->data_corrected) {
u64 start = sblock->pagev[0]->logical;
u64 end = sblock->pagev[sblock->page_count - 1]->logical +
- PAGE_SIZE;
+ sblock->sctx->fs_info->sectorsize;
ASSERT(end - start <= U32_MAX);
scrub_parity_mark_sectors_error(sblock->sparity,
@@ -2418,7 +2486,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *su
* the csum into @csum.
*
* The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges. We're reponsible to cleanup any range
+ * storing bytenr ordered csum ranges. We're responsible to cleanup any range
* that is before @logical.
*
* Return 0 if there is no csum for the range.
@@ -3138,28 +3206,23 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
physical = map->stripes[num].physical;
offset = 0;
nstripes = div64_u64(length, map->stripe_len);
+ mirror_num = 1;
+ increment = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
- mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
int factor = map->num_stripes / map->sub_stripes;
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
mirror_num = num % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
- mirror_num = 1;
- } else {
- increment = map->stripe_len;
- mirror_num = 1;
}
path = btrfs_alloc_path();
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index bd69db72acc5..6ac37ae6c811 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2078,16 +2078,6 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
}
/*
- * Removes the entry from the list and adds it back to the end. This marks the
- * entry as recently used so that name_cache_clean_unused does not remove it.
- */
-static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
-{
- list_del(&nce->list);
- list_add_tail(&nce->list, &sctx->name_cache_list);
-}
-
-/*
* Remove some entries from the beginning of name_cache_list.
*/
static void name_cache_clean_unused(struct send_ctx *sctx)
@@ -2147,7 +2137,13 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
kfree(nce);
nce = NULL;
} else {
- name_cache_used(sctx, nce);
+ /*
+ * Removes the entry from the list and adds it back to
+ * the end. This marks the entry as recently used so
+ * that name_cache_clean_unused does not remove it.
+ */
+ list_move_tail(&nce->list, &sctx->name_cache_list);
+
*parent_ino = nce->parent_ino;
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
@@ -4064,6 +4060,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
} else {
+ /*
+ * If we previously orphanized a directory that
+ * collided with a new reference that we already
+ * processed, recompute the current path because
+ * that directory may be part of the path.
+ */
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
@@ -6507,7 +6514,7 @@ static int changed_extent(struct send_ctx *sctx,
* updates the inode item, but it only changes the iversion (sequence
* field in the inode item) of the inode, so if a file is deduplicated
* the same amount of times in both the parent and send snapshots, its
- * iversion becames the same in both snapshots, whence the inode item is
+ * iversion becomes the same in both snapshots, whence the inode item is
* the same on both snapshots.
*/
if (sctx->cur_ino != sctx->cmp_key->objectid)
@@ -7409,23 +7416,21 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (ret)
goto out;
- mutex_lock(&fs_info->balance_mutex);
- if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
- mutex_unlock(&fs_info->balance_mutex);
+ spin_lock(&fs_info->send_reloc_lock);
+ if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+ spin_unlock(&fs_info->send_reloc_lock);
btrfs_warn_rl(fs_info,
- "cannot run send because a balance operation is in progress");
+ "cannot run send because a relocation operation is in progress");
ret = -EAGAIN;
goto out;
}
fs_info->send_in_progress++;
- mutex_unlock(&fs_info->balance_mutex);
+ spin_unlock(&fs_info->send_reloc_lock);
- current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
- current->journal_info = NULL;
- mutex_lock(&fs_info->balance_mutex);
+ spin_lock(&fs_info->send_reloc_lock);
fs_info->send_in_progress--;
- mutex_unlock(&fs_info->balance_mutex);
+ spin_unlock(&fs_info->send_reloc_lock);
if (ret < 0)
goto out;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 2dc674b7c3b1..f79bf85f2439 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -133,18 +133,13 @@
* operations, however they won't be usable until the transaction commits.
*
* COMMIT_TRANS
- * may_commit_transaction() is the ultimate arbiter on whether we commit the
- * transaction or not. In order to avoid constantly churning we do all the
- * above flushing first and then commit the transaction as the last resort.
- * However we need to take into account things like pinned space that would
- * be freed, plus any delayed work we may not have gotten rid of in the case
- * of metadata.
- *
- * FORCE_COMMIT_TRANS
- * For use by the preemptive flusher. We use this to bypass the ticketing
- * checks in may_commit_transaction, as we have more information about the
- * overall state of the system and may want to commit the transaction ahead
- * of actual ENOSPC conditions.
+ * This will commit the transaction. Historically we had a lot of logic
+ * surrounding whether or not we'd commit the transaction, but this was born
+ * out of a pre-tickets era where we could end up committing the transaction
+ * thousands of times in a row without making progress. Now thanks to our
+ * ticketing system we know if we're not making progress and can error
+ * everybody out after a few commits rather than burning the disk hoping for
+ * a different answer.
*
* OVERCOMMIT
*
@@ -197,13 +192,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (!space_info)
return -ENOMEM;
- ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
- GFP_KERNEL);
- if (ret) {
- kfree(space_info);
- return ret;
- }
-
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
@@ -389,7 +377,7 @@ again:
ticket = list_first_entry(head, struct reserve_ticket, list);
- /* Check and see if our ticket can be satisified now. */
+ /* Check and see if our ticket can be satisfied now. */
if ((used + ticket->bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
@@ -495,7 +483,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
*/
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- u64 to_reclaim, bool wait_ordered)
+ u64 to_reclaim, bool wait_ordered,
+ bool for_preempt)
{
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
@@ -532,7 +521,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
* ordered extents, otherwise we'll waste time trying to flush delalloc
* that likely won't give us the space back we need.
*/
- if (ordered_bytes > delalloc_bytes)
+ if (ordered_bytes > delalloc_bytes && !for_preempt)
wait_ordered = true;
loops = 0;
@@ -551,6 +540,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
break;
}
+ /*
+ * If we are for preemption we just want a one-shot of delalloc
+ * flushing so we can stop flushing if we decide we don't need
+ * to anymore.
+ */
+ if (for_preempt)
+ break;
+
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -566,109 +563,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
}
}
-/**
- * Possibly commit the transaction if its ok to
- *
- * @fs_info: the filesystem
- * @space_info: space_info we are checking for commit, either data or metadata
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does. Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info)
-{
- struct reserve_ticket *ticket = NULL;
- struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
- struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
- struct btrfs_trans_handle *trans;
- u64 reclaim_bytes = 0;
- u64 bytes_needed = 0;
- u64 cur_free_bytes = 0;
-
- trans = (struct btrfs_trans_handle *)current->journal_info;
- if (trans)
- return -EAGAIN;
-
- spin_lock(&space_info->lock);
- cur_free_bytes = btrfs_space_info_used(space_info, true);
- if (cur_free_bytes < space_info->total_bytes)
- cur_free_bytes = space_info->total_bytes - cur_free_bytes;
- else
- cur_free_bytes = 0;
-
- if (!list_empty(&space_info->priority_tickets))
- ticket = list_first_entry(&space_info->priority_tickets,
- struct reserve_ticket, list);
- else if (!list_empty(&space_info->tickets))
- ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
- if (ticket)
- bytes_needed = ticket->bytes;
-
- if (bytes_needed > cur_free_bytes)
- bytes_needed -= cur_free_bytes;
- else
- bytes_needed = 0;
- spin_unlock(&space_info->lock);
-
- if (!bytes_needed)
- return 0;
-
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- /*
- * See if there is enough pinned space to make this reservation, or if
- * we have block groups that are going to be freed, allowing us to
- * possibly do a chunk allocation the next loop through.
- */
- if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
- __percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
- goto commit;
-
- /*
- * See if there is some space in the delayed insertion reserve for this
- * reservation. If the space_info's don't match (like for DATA or
- * SYSTEM) then just go enospc, reclaiming this space won't recover any
- * space to satisfy those reservations.
- */
- if (space_info != delayed_rsv->space_info)
- goto enospc;
-
- spin_lock(&delayed_rsv->lock);
- reclaim_bytes += delayed_rsv->reserved;
- spin_unlock(&delayed_rsv->lock);
-
- spin_lock(&delayed_refs_rsv->lock);
- reclaim_bytes += delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
-
- spin_lock(&trans_rsv->lock);
- reclaim_bytes += trans_rsv->reserved;
- spin_unlock(&trans_rsv->lock);
-
- if (reclaim_bytes >= bytes_needed)
- goto commit;
- bytes_needed -= reclaim_bytes;
-
- if (__percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes_needed,
- BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
- goto enospc;
-
-commit:
- return btrfs_commit_transaction(trans);
-enospc:
- btrfs_end_transaction(trans);
- return -ENOSPC;
-}
-
/*
* Try to flush some data based on policy set by @state. This is only advisory
* and may fail for various reasons. The caller is supposed to examine the
@@ -702,7 +596,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
shrink_delalloc(fs_info, space_info, num_bytes,
- state == FLUSH_DELALLOC_WAIT);
+ state == FLUSH_DELALLOC_WAIT, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -743,9 +637,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
btrfs_wait_on_delayed_iputs(fs_info);
break;
case COMMIT_TRANS:
- ret = may_commit_transaction(fs_info, space_info);
- break;
- case FORCE_COMMIT_TRANS:
+ ASSERT(current->journal_info == NULL);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -792,12 +684,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
+ u64 global_rsv_size = fs_info->global_block_rsv.reserved;
u64 ordered, delalloc;
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
u64 used;
/* If we're just plain full then async reclaim just slows us down. */
- if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
+ if ((space_info->bytes_used + space_info->bytes_reserved +
+ global_rsv_size) >= thresh)
return false;
/*
@@ -838,8 +732,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
thresh = calc_available_free_space(fs_info, space_info,
BTRFS_RESERVE_FLUSH_ALL);
- thresh += (space_info->total_bytes - space_info->bytes_used -
- space_info->bytes_reserved - space_info->bytes_readonly);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_readonly + global_rsv_size;
+ if (used < space_info->total_bytes)
+ thresh += space_info->total_bytes - used;
thresh >>= space_info->clamp;
used = space_info->bytes_pinned;
@@ -860,14 +756,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* clearly be heavy enough to warrant preemptive flushing. In the case
* of heavy DIO or ordered reservations, preemptive flushing will just
* waste time and cause us to slow down.
+ *
+ * We want to make sure we truly are maxed out on ordered however, so
+ * cut ordered in half, and if it's still higher than delalloc then we
+ * can keep flushing. This is to avoid the case where we start
+ * flushing, and now delalloc == ordered and we stop preemptively
+ * flushing when we could still have several gigs of delalloc to flush.
*/
- ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
used += fs_info->delayed_refs_rsv.reserved +
fs_info->delayed_block_rsv.reserved;
else
- used += space_info->bytes_may_use;
+ used += space_info->bytes_may_use - global_rsv_size;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
@@ -921,7 +823,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
- u64 first_ticket_bytes = 0;
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
@@ -937,21 +838,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- /*
- * may_commit_transaction will avoid committing the transaction
- * if it doesn't feel like the space reclaimed by the commit
- * would result in the ticket succeeding. However if we have a
- * smaller ticket in the queue it may be small enough to be
- * satisified by committing the transaction, so if any
- * subsequent ticket is smaller than the first ticket go ahead
- * and send us back for another loop through the enospc flushing
- * code.
- */
- if (first_ticket_bytes == 0)
- first_ticket_bytes = ticket->bytes;
- else if (first_ticket_bytes > ticket->bytes)
- return true;
-
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
@@ -1117,7 +1003,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
(delayed_block_rsv->reserved +
delayed_refs_rsv->reserved)) {
to_reclaim = space_info->bytes_pinned;
- flush = FORCE_COMMIT_TRANS;
+ flush = COMMIT_TRANS;
} else if (delayed_block_rsv->reserved >
delayed_refs_rsv->reserved) {
to_reclaim = delayed_block_rsv->reserved;
@@ -1171,28 +1057,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* immediately re-usable, it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed.
*
- * FLUSH_DELAYED_REFS
- * The above two cases generate delayed refs that will affect
- * ->total_bytes_pinned. However this counter can be inconsistent with
- * reality if there are outstanding delayed refs. This is because we adjust
- * the counter based solely on the current set of delayed refs and disregard
- * any on-disk state which might include more refs. So for example, if we
- * have an extent with 2 references, but we only drop 1, we'll see that there
- * is a negative delayed ref count for the extent and assume that the space
- * will be freed, and thus increase ->total_bytes_pinned.
- *
- * Running the delayed refs gives us the actual real view of what will be
- * freed at the transaction commit time. This stage will not actually free
- * space for us, it just makes sure that may_commit_transaction() has all of
- * the information it needs to make the right decision.
- *
* COMMIT_TRANS
- * This is where we reclaim all of the pinned space generated by the previous
- * two stages. We will not commit the transaction if we don't think we're
- * likely to satisfy our request, which means if our current free space +
- * total_bytes_pinned < reservation we will not commit. This is why the
- * previous states are actually important, to make sure we know for sure
- * whether committing the transaction will allow us to make progress.
+ * This is where we reclaim all of the pinned space generated by running the
+ * iputs.
*
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
@@ -1202,7 +1069,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
static const enum btrfs_flush_state data_flush_states[] = {
FLUSH_DELALLOC_WAIT,
RUN_DELAYED_IPUTS,
- FLUSH_DELAYED_REFS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
};
@@ -1561,6 +1427,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
+ /*
+ * We were forced to add a reserve ticket, so
+ * our preemptive flushing is unable to keep
+ * up. Clamp down on the threshold for the
+ * preemptive flushing in order to keep up with
+ * the workload.
+ */
+ maybe_clamp_preempt(fs_info, space_info);
+
space_info->flush = 1;
trace_btrfs_trigger_flush(fs_info,
space_info->flags,
@@ -1572,14 +1447,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
list_add_tail(&ticket.list,
&space_info->priority_tickets);
}
-
- /*
- * We were forced to add a reserve ticket, so our preemptive
- * flushing is unable to keep up. Clamp down on the threshold
- * for the preemptive flushing in order to keep up with the
- * workload.
- */
- maybe_clamp_preempt(fs_info, space_info);
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
@@ -1588,8 +1455,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_preemptive_reclaim(fs_info, space_info) &&
- !work_busy(&fs_info->preempt_reclaim_work)) {
+ !work_busy(&fs_info->preempt_reclaim_work) &&
+ need_preemptive_reclaim(fs_info, space_info)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
queue_work(system_unbound_wq,
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index b1a8ffb03b3e..cb5056472e79 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -43,18 +43,6 @@ struct btrfs_space_info {
u64 flags;
- /*
- * bytes_pinned is kept in line with what is actually pinned, as in
- * we've called update_block_group and dropped the bytes_used counter
- * and increased the bytes_pinned counter. However this means that
- * bytes_pinned does not reflect the bytes that will be pinned once the
- * delayed refs are flushed, so this counter is inc'ed every time we
- * call btrfs_free_extent so it is a realtime count of what will be
- * freed once the transaction is committed. It will be zeroed every
- * time the transaction commits.
- */
- struct percpu_counter total_bytes_pinned;
-
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
@@ -157,22 +145,4 @@ static inline void btrfs_space_info_free_bytes_may_use(
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
-
-static inline void __btrfs_mod_total_bytes_pinned(
- struct btrfs_space_info *space_info,
- s64 mod)
-{
- percpu_counter_add_batch(&space_info->total_bytes_pinned, mod,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info,
- u64 flags, s64 mod)
-{
- struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags);
-
- ASSERT(space_info);
- __btrfs_mod_total_bytes_pinned(space_info, mod);
-}
-
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 2d19089ab625..640bcd21bf28 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -3,6 +3,7 @@
#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
+#include "btrfs_inode.h"
/*
* Subpage (sectorsize < PAGE_SIZE) support overview:
@@ -110,10 +111,12 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
if (!*ret)
return -ENOMEM;
spin_lock_init(&(*ret)->lock);
- if (type == BTRFS_SUBPAGE_METADATA)
+ if (type == BTRFS_SUBPAGE_METADATA) {
atomic_set(&(*ret)->eb_refs, 0);
- else
+ } else {
atomic_set(&(*ret)->readers, 0);
+ atomic_set(&(*ret)->writers, 0);
+ }
return 0;
}
@@ -183,12 +186,10 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
const int nbits = len >> fs_info->sectorsize_bits;
- int ret;
btrfs_subpage_assert(fs_info, page, start, len);
- ret = atomic_add_return(nbits, &subpage->readers);
- ASSERT(ret == nbits);
+ atomic_add(nbits, &subpage->readers);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
@@ -196,10 +197,95 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
const int nbits = len >> fs_info->sectorsize_bits;
+ bool is_data;
+ bool last;
btrfs_subpage_assert(fs_info, page, start, len);
+ is_data = is_data_inode(page->mapping->host);
ASSERT(atomic_read(&subpage->readers) >= nbits);
- if (atomic_sub_and_test(nbits, &subpage->readers))
+ last = atomic_sub_and_test(nbits, &subpage->readers);
+
+ /*
+ * For data we need to unlock the page if the last read has finished.
+ *
+ * Please don't replace @last with an atomic_sub_and_test() call inside
+ * the if () condition: short-circuit evaluation would skip it whenever
+ * @is_data is false, and we want atomic_sub_and_test() to always execute.
+ */
+ if (is_data && last)
+ unlock_page(page);
+}
+
+static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+{
+ u64 orig_start = *start;
+ u32 orig_len = *len;
+
+ *start = max_t(u64, page_offset(page), orig_start);
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
+}
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+ int ret;
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ASSERT(atomic_read(&subpage->readers) == 0);
+ ret = atomic_add_return(nbits, &subpage->writers);
+ ASSERT(ret == nbits);
+}
+
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const int nbits = (len >> fs_info->sectorsize_bits);
+
+ btrfs_subpage_assert(fs_info, page, start, len);
+
+ ASSERT(atomic_read(&subpage->writers) >= nbits);
+ return atomic_sub_and_test(nbits, &subpage->writers);
+}
+
+/*
+ * Lock a page for delalloc page writeback.
+ *
+ * Return -EAGAIN if the page is not properly initialized.
+ * Return 0 with the page locked, and writer counter updated.
+ *
+ * Even with 0 returned, the page still needs an extra check to make sure
+ * it's really the correct page, as the caller is using
+ * find_get_pages_contig(), which can race with page invalidating.
+ */
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
+ lock_page(page);
+ return 0;
+ }
+ lock_page(page);
+ if (!PagePrivate(page) || !page->private) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ btrfs_subpage_clamp_range(page, &start, &len);
+ btrfs_subpage_start_writer(fs_info, page, start, len);
+ return 0;
+}
+
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
+ return unlock_page(page);
+ btrfs_subpage_clamp_range(page, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
unlock_page(page);
}
@@ -354,6 +440,32 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&subpage->lock, flags);
}
+void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->ordered_bitmap |= tmp;
+ SetPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->ordered_bitmap &= ~tmp;
+ if (subpage->ordered_bitmap == 0)
+ ClearPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -376,6 +488,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -408,6 +521,34 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
return test_page_func(page); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ set_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, page, start, len); \
+} \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
+ clear_page_func(page); \
+ return; \
+ } \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, page, start, len); \
+} \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len) \
+{ \
+ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
+ return test_page_func(page); \
+ btrfs_subpage_clamp_range(page, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, page, start, len); \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
PageUptodate);
@@ -416,3 +557,5 @@ IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
+ PageOrdered);
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index bfd626e955be..4d7aca85d915 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -22,6 +22,14 @@ struct btrfs_subpage {
u16 error_bitmap;
u16 dirty_bitmap;
u16 writeback_bitmap;
+ /*
+ * Both data and metadata need to track how many readers the page
+ * has.
+ * Data relies on @readers to unlock the page when the last reader finishes.
+ * While metadata doesn't need page unlock, it needs to prevent
+ * page::private from being cleared before the last end_page_read().
+ */
+ atomic_t readers;
union {
/*
* Structures only used by metadata
@@ -32,7 +40,10 @@ struct btrfs_subpage {
atomic_t eb_refs;
/* Structures only used by data */
struct {
- atomic_t readers;
+ atomic_t writers;
+
+ /* Track pending ordered extents in this page's sectors */
+ u16 ordered_bitmap;
};
};
};
@@ -63,6 +74,15 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
+
/*
* Template for subpage related operations.
*
@@ -72,6 +92,10 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
* btrfs_page_*() are for call sites where the page can either be subpage
* specific or regular page. The function will handle both cases.
* But the range still needs to be inside the page.
+ *
+ * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * need to be inside the page. Those functions will truncate the range
+ * automatically.
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
@@ -85,12 +109,19 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len); \
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct page *page, u64 start, u32 len); \
+bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
+DECLARE_BTRFS_SUBPAGE_OPS(ordered);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4a396c1147f1..d07b18b2b250 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -299,17 +299,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
- /* Nothing used. The other threads that have joined this
- * transaction may be able to continue. */
- if (!trans->dirty && list_empty(&trans->new_bgs)) {
- const char *errstr;
-
- errstr = btrfs_decode_error(errno);
- btrfs_warn(fs_info,
- "%s:%d: Aborting unused transaction(%s).",
- function, line, errstr);
- return;
- }
WRITE_ONCE(trans->transaction->aborted, errno);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
@@ -945,8 +934,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_check_integrity_including_extent_data:
btrfs_info(info,
"enabling check integrity including extent data");
- btrfs_set_opt(info->mount_opt,
- CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
@@ -1527,7 +1515,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+ if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA))
seq_puts(seq, ",check_int_data");
else if (btrfs_test_opt(info, CHECK_INTEGRITY))
seq_puts(seq, ",check_int");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 436ac7b4b334..9d1d140118ff 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -429,7 +429,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
@@ -451,7 +451,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
@@ -665,15 +665,6 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
} \
BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field)
-static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
- struct kobj_attribute *a,
- char *buf)
-{
- struct btrfs_space_info *sinfo = to_space_info(kobj);
- s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
- return scnprintf(buf, PAGE_SIZE, "%lld\n", val);
-}
-
SPACE_INFO_ATTR(flags);
SPACE_INFO_ATTR(total_bytes);
SPACE_INFO_ATTR(bytes_used);
@@ -684,8 +675,6 @@ SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
-BTRFS_ATTR(space_info, total_bytes_pinned,
- btrfs_space_info_show_total_bytes_pinned);
static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, flags),
@@ -698,7 +687,6 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
- BTRFS_ATTR_PTR(space_info, total_bytes_pinned),
NULL,
};
ATTRIBUTE_GROUPS(space_info);
@@ -706,7 +694,6 @@ ATTRIBUTE_GROUPS(space_info);
static void space_info_release(struct kobject *kobj)
{
struct btrfs_space_info *sinfo = to_space_info(kobj);
- percpu_counter_destroy(&sinfo->total_bytes_pinned);
kfree(sinfo);
}
@@ -1455,6 +1442,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
+static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ READ_ONCE(device->scrub_speed_max));
+}
+
+static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+ char *endptr;
+ unsigned long long limit;
+
+ limit = memparse(buf, &endptr);
+ WRITE_ONCE(device->scrub_speed_max, limit);
+ return len;
+}
+BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show,
+ btrfs_devinfo_scrub_speed_max_store);
+
static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1468,10 +1482,40 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
+static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_device *device = container_of(kobj, struct btrfs_device,
+ devid_kobj);
+
+ if (!device->dev_stats_valid)
+ return scnprintf(buf, PAGE_SIZE, "invalid\n");
+
+ /*
+ * Print all at once so we get a snapshot of all values from the same
+ * time. Keep them in sync and in order of definition of
+ * btrfs_dev_stat_values.
+ */
+ return scnprintf(buf, PAGE_SIZE,
+ "write_errs %d\n"
+ "read_errs %d\n"
+ "flush_errs %d\n"
+ "corruption_errs %d\n"
+ "generation_errs %d\n",
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show);
+
static struct attribute *devid_attrs[] = {
+ BTRFS_ATTR_PTR(devid, error_stats),
BTRFS_ATTR_PTR(devid, in_fs_metadata),
BTRFS_ATTR_PTR(devid, missing),
BTRFS_ATTR_PTR(devid, replace_target),
+ BTRFS_ATTR_PTR(devid, scrub_speed_max),
BTRFS_ATTR_PTR(devid, writeable),
NULL
};
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index c0aefe6dee0b..319fed82d741 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -557,7 +557,7 @@ int btrfs_test_extent_map(void)
{
/*
* Test a chunk with 2 data stripes one of which
- * interesects the physical address of the super block
+ * intersects the physical address of the super block
* is correctly recognised.
*/
.raid_type = BTRFS_BLOCK_GROUP_RAID1,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f75de9f6c0ad..50318231c1a8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -583,9 +583,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
bool do_chunk_alloc = false;
int ret;
- /* Send isn't supposed to start transactions. */
- ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
-
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return ERR_PTR(-EROFS);
@@ -1406,8 +1403,10 @@ int btrfs_defrag_root(struct btrfs_root *root)
while (1) {
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
ret = btrfs_defrag_leaves(trans, root);
@@ -1476,7 +1475,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
/*
@@ -1869,31 +1868,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
}
/*
- * wait for the current transaction commit to start and block subsequent
- * transaction joins
- */
-static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *trans)
-{
- wait_event(fs_info->transaction_blocked_wait,
- trans->state >= TRANS_STATE_COMMIT_START ||
- TRANS_ABORTED(trans));
-}
-
-/*
- * wait for the current transaction to start and then become unblocked.
- * caller holds ref.
- */
-static void wait_current_trans_commit_start_and_unblock(
- struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *trans)
-{
- wait_event(fs_info->transaction_wait,
- trans->state >= TRANS_STATE_UNBLOCKED ||
- TRANS_ABORTED(trans));
-}
-
-/*
* commit transactions asynchronously. once btrfs_commit_transaction_async
* returns, any subsequent transaction will not be allowed to join.
*/
@@ -1920,8 +1894,7 @@ static void do_async_commit(struct work_struct *work)
kfree(ac);
}
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- int wait_for_unblock)
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_async_commit *ac;
@@ -1953,13 +1926,13 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
__sb_writers_release(fs_info->sb, SB_FREEZE_FS);
schedule_work(&ac->work);
-
- /* wait for transaction to start and unblock */
- if (wait_for_unblock)
- wait_current_trans_commit_start_and_unblock(fs_info, cur_trans);
- else
- wait_current_trans_commit_start(fs_info, cur_trans);
-
+ /*
+ * Wait for the current transaction commit to start and block
+ * subsequent transaction joins
+ */
+ wait_event(fs_info->transaction_blocked_wait,
+ cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ TRANS_ABORTED(cur_trans));
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -2074,14 +2047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ASSERT(refcount_read(&trans->use_count) == 1);
- /*
- * Some places just start a transaction to commit it. We need to make
- * sure that if this commit fails that the abort code actually marks the
- * transaction as failed, so set trans->dirty to make the abort code do
- * the right thing.
- */
- trans->dirty = true;
-
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 364cfbb4c5c5..07d76029f598 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -122,8 +122,6 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
-#define BTRFS_SEND_TRANS_STUB ((void *)1)
-
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
@@ -143,7 +141,6 @@ struct btrfs_trans_handle {
bool allocating_chunk;
bool can_flush_pending_bgs;
bool reloc_reserved;
- bool dirty;
bool in_fsync;
struct btrfs_root *root;
struct btrfs_fs_info *fs_info;
@@ -227,8 +224,7 @@ void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
-int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
- int wait_for_unblock);
+int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 14ec61048483..cab451d19547 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
@@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
@@ -1858,8 +1863,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
- } else {
- BUG(); /* Logic Error */
}
iput(inode);
@@ -3299,6 +3302,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* begins and releases it only after writing its superblock.
*/
mutex_lock(&fs_info->tree_log_mutex);
+
+ /*
+ * The previous transaction writeout phase could have failed, and thus
+ * marked the fs in an error state. We must not commit here, as we
+ * could have updated our generation in the super_for_commit and
+ * writing the super here would result in transid mismatches. If there
+ * is an error here just bail.
+ */
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ ret = -EIO;
+ btrfs_set_log_full_commit(trans);
+ btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&fs_info->tree_log_mutex);
+ goto out_wake_log_root;
+ }
+
btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
ret = write_all_supers(fs_info, 1);
@@ -4449,7 +4468,8 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
ret = btrfs_truncate_inode_items(trans,
root->log_root,
inode, truncate_offset,
- BTRFS_EXTENT_DATA_KEY);
+ BTRFS_EXTENT_DATA_KEY,
+ NULL);
} while (ret == -EAGAIN);
if (ret)
goto out;
@@ -5397,7 +5417,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&inode->runtime_flags);
while(1) {
ret = btrfs_truncate_inode_items(trans,
- log, inode, 0, 0);
+ log, inode, 0, 0, NULL);
if (ret != -EAGAIN)
break;
}
@@ -5447,13 +5467,23 @@ log_extents:
btrfs_release_path(dst_path);
if (need_log_inode_item) {
err = log_inode_item(trans, log, dst_path, inode);
- if (!err && !xattrs_logged) {
+ if (err)
+ goto out_unlock;
+ /*
+ * If we are doing a fast fsync and the inode was logged before
+ * in this transaction, we don't need to log the xattrs because
+ * they were logged before. If xattrs were added, changed or
+ * deleted since the last time we logged the inode, then we have
+ * already logged them because the inode had the runtime flag
+ * BTRFS_INODE_COPY_EVERYTHING set.
+ */
+ if (!xattrs_logged && inode->logged_trans < trans->transid) {
err = btrfs_log_all_xattrs(trans, root, inode, path,
dst_path);
+ if (err)
+ goto out_unlock;
btrfs_release_path(path);
}
- if (err)
- goto out_unlock;
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
@@ -6352,6 +6382,7 @@ next:
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 47d27059d064..807502cd6510 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -717,7 +717,7 @@ static struct btrfs_fs_devices *find_fsid_changed(
/*
* Handles the case where scanned device is part of an fs that had
- * multiple successful changes of FSID but curently device didn't
+ * multiple successful changes of FSID but currently device didn't
* observe it. Meaning our fsid will be different than theirs. We need
* to handle two subcases :
* 1 - The fs still continues to have different METADATA/FSID uuids.
@@ -1247,7 +1247,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
lockdep_assert_held(&uuid_mutex);
/*
* The device_list_mutex cannot be taken here in case opening the
- * underlying device takes further locks like bd_mutex.
+ * underlying device takes further locks like open_mutex.
*
* We also don't need the lock here as this is called during mount and
* exclusion is provided by uuid_mutex
@@ -1550,7 +1550,7 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
* check to ensure dev extents are not double allocated.
* This makes the function safe to allocate dev extents but may not report
* correct usable device space, as device extent freed in current transaction
- * is not reported as avaiable.
+ * is not reported as available.
*/
static int find_free_dev_extent_start(struct btrfs_device *device,
u64 num_bytes, u64 search_start, u64 *start,
@@ -4217,14 +4217,6 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
btrfs_bg_type_to_raid_name(data_target));
}
- if (fs_info->send_in_progress) {
- btrfs_warn_rl(fs_info,
-"cannot run balance while send operations are in progress (%d in progress)",
- fs_info->send_in_progress);
- ret = -EAGAIN;
- goto out;
- }
-
ret = insert_balance_item(fs_info, bctl);
if (ret && ret != -EEXIST)
goto out;
@@ -6127,17 +6119,17 @@ static bool need_full_stripe(enum btrfs_map_op op)
* @em: mapping containing the logical extent
* @op: type of operation - write or read
* @logical: address that we want to figure out the geometry of
- * @len: the length of IO we are going to perform, starting at @logical
* @io_geom: pointer used to return values
*
* Returns < 0 in case a chunk for the given logical address cannot be found,
* usually shouldn't happen unless @logical is corrupted, 0 otherwise.
*/
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
- enum btrfs_map_op op, u64 logical, u64 len,
+ enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom)
{
struct map_lookup *map;
+ u64 len;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
@@ -6152,7 +6144,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
- /* Stripe wher this block falls in */
+ /* Stripe where this block falls in */
stripe_nr = div64_u64(offset, stripe_len);
/* Offset of stripe in the chunk */
stripe_offset = stripe_nr * stripe_len;
@@ -6243,7 +6235,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
em = btrfs_get_chunk_map(fs_info, logical, *length);
ASSERT(!IS_ERR(em));
- ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom);
+ ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
if (ret < 0)
return ret;
@@ -6670,8 +6662,6 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
*
* If devid and uuid are both specified, the match must be exact, otherwise
* only devid is used.
- *
- * If @seed is true, traverse through the seed devices.
*/
struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *uuid, u8 *fsid)
@@ -7865,7 +7855,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
ret = -EUCLEAN;
}
- /* Make sure no dev extent is beyond device bondary */
+ /* Make sure no dev extent is beyond device boundary */
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9c0d84e5ec06..c7fc7caf575c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -143,6 +143,9 @@ struct btrfs_device {
struct completion kobj_unregister;
/* For sysfs/FSID/devinfo/devid/ */
struct kobject devid_kobj;
+
+ /* Bandwidth limit for scrub, in bytes */
+ u64 scrub_speed_max;
};
/*
@@ -443,7 +446,7 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
- enum btrfs_map_op op, u64 logical, u64 len,
+ enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 304ce64c70a4..297c0b1c0634 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -81,7 +81,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
* *: Special case, no superblock is written
* 0: Use write pointer of zones[0]
* 1: Use write pointer of zones[1]
- * C: Compare super blcoks from zones[0] and zones[1], use the latest
+ * C: Compare super blocks from zones[0] and zones[1], use the latest
* one determined by generation
* x: Invalid state
*/
@@ -150,6 +150,18 @@ static inline u32 sb_zone_number(int shift, int mirror)
return (u32)zone;
}
+static inline sector_t zone_start_sector(u32 zone_number,
+ struct block_device *bdev)
+{
+ return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
+}
+
+static inline u64 zone_start_physical(u32 zone_number,
+ struct btrfs_zoned_device_info *zone_info)
+{
+ return (u64)zone_number << zone_info->zone_size_shift;
+}
+
/*
* Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
* device into static sized chunks and fake a conventional zone on each of
@@ -405,8 +417,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (sb_zone + 1 >= zone_info->nr_zones)
continue;
- sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
- ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
+ ret = btrfs_get_dev_zones(device,
+ zone_start_physical(sb_zone, zone_info),
&zone_info->sb_zones[sb_pos],
&nr_zones);
if (ret)
@@ -421,7 +433,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
}
/*
- * If zones[0] is conventional, always use the beggining of the
+ * If zones[0] is conventional, always use the beginning of the
* zone to record superblock. No need to validate in that case.
*/
if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
@@ -721,7 +733,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
+ ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
zones);
if (ret < 0)
@@ -826,7 +838,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
return -ENOENT;
return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- sb_zone << zone_sectors_shift,
+ zone_start_sector(sb_zone, bdev),
zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
@@ -878,7 +890,8 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
if (!(end <= sb_zone ||
sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
have_sb = true;
- pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
+ pos = zone_start_physical(
+ sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
break;
}
@@ -1127,6 +1140,10 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
+ zone.start << SECTOR_SHIFT,
+ rcu_str_deref(device->name), device->devid);
ret = -EIO;
goto out;
}
@@ -1187,6 +1204,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
case 0: /* single */
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical);
+ ret = -EIO;
+ goto out;
+ }
cache->alloc_offset = alloc_offsets[0];
break;
case BTRFS_BLOCK_GROUP_DUP:
@@ -1204,6 +1228,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
+ if (cache->alloc_offset > fs_info->zone_size) {
+ btrfs_err(fs_info,
+ "zoned: invalid write pointer %llu in block group %llu",
+ cache->alloc_offset, cache->start);
+ ret = -EIO;
+ }
+
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
btrfs_err(fs_info,
@@ -1278,7 +1309,7 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans)
spin_unlock(&trans->releasing_ebs_lock);
}
-bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_group *cache;
@@ -1293,7 +1324,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
if (!is_data_inode(&inode->vfs_inode))
return false;
- cache = btrfs_lookup_block_group(fs_info, em->block_start);
+ cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
return false;
@@ -1502,3 +1533,24 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
length = wp - physical_pos;
return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}
+
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_device *device;
+ struct extent_map *em;
+ struct map_lookup *map;
+
+ em = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(em))
+ return ERR_CAST(em);
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ ASSERT(map->num_stripes == 1);
+ device = map->stripes[0].dev;
+
+ free_extent_map(em);
+
+ return device;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 5e41a74a9cb2..b0ae2608cb6b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -53,7 +53,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb);
void btrfs_free_redirty_list(struct btrfs_transaction *trans);
-bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em);
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start);
void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
struct bio *bio);
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
@@ -65,6 +65,8 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
u64 physical_start, u64 physical_pos);
+struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -152,8 +154,7 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb) { }
static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
-static inline bool btrfs_use_zone_append(struct btrfs_inode *inode,
- struct extent_map *em)
+static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
return false;
}
@@ -192,6 +193,13 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev,
return -EOPNOTSUPP;
}
+static inline struct btrfs_device *btrfs_zoned_get_device(
+ struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/buffer.c b/fs/buffer.c
index ea48c01fb76b..6290c3afdba4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -589,31 +589,6 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
EXPORT_SYMBOL(mark_buffer_dirty_inode);
/*
- * Mark the page dirty, and set it dirty in the page cache, and mark the inode
- * dirty.
- *
- * If warn is true, then emit a warning if the page is not uptodate and has
- * not been truncated.
- *
- * The caller must hold lock_page_memcg().
- */
-void __set_page_dirty(struct page *page, struct address_space *mapping,
- int warn)
-{
- unsigned long flags;
-
- xa_lock_irqsave(&mapping->i_pages, flags);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- __xa_set_mark(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
- }
- xa_unlock_irqrestore(&mapping->i_pages, flags);
-}
-EXPORT_SYMBOL_GPL(__set_page_dirty);
-
-/*
* Add a page to the dirty page list.
*
* It is a sad fact of life that this function is called from several places
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 5624fae7a603..9ba79b6531fb 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -668,14 +668,13 @@ out:
* Handle lookups for the hidden .snap directory.
*/
struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err)
+ struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
/* .snap dir? */
- if (err == -ENOENT &&
- ceph_snap(parent) == CEPH_NOSNAP &&
+ if (ceph_snap(parent) == CEPH_NOSNAP &&
strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) {
struct dentry *res;
struct inode *inode = ceph_get_snapdir(parent);
@@ -742,7 +741,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
- struct dentry *res;
int op;
int mask;
int err;
@@ -793,12 +791,16 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, NULL, req);
- res = ceph_handle_snapdir(req, dentry, err);
- if (IS_ERR(res)) {
- err = PTR_ERR(res);
- } else {
- dentry = res;
- err = 0;
+ if (err == -ENOENT) {
+ struct dentry *res;
+
+ res = ceph_handle_snapdir(req, dentry);
+ if (IS_ERR(res)) {
+ err = PTR_ERR(res);
+ } else {
+ dentry = res;
+ err = 0;
+ }
}
dentry = ceph_finish_lookup(req, dentry, err);
ceph_mdsc_put_request(req); /* will dput(dentry) */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 77fc037d5beb..d51af3698032 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -578,6 +578,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
struct ceph_inode_info *ci = ceph_inode(dir);
struct inode *inode;
struct timespec64 now;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_vino vino = { .ino = req->r_deleg_ino,
.snap = CEPH_NOSNAP };
@@ -615,8 +616,10 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
ceph_file_layout_to_legacy(lo, &in.layout);
+ down_read(&mdsc->snap_rwsem);
ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
req->r_fmode, NULL);
+ up_read(&mdsc->snap_rwsem);
if (ret) {
dout("%s failed to fill inode: %d\n", __func__, ret);
ceph_dir_clear_complete(dir);
@@ -739,14 +742,16 @@ retry:
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
- dentry = ceph_handle_snapdir(req, dentry, err);
- if (IS_ERR(dentry)) {
- err = PTR_ERR(dentry);
- goto out_req;
+ if (err == -ENOENT) {
+ dentry = ceph_handle_snapdir(req, dentry);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_req;
+ }
+ err = 0;
}
- err = 0;
- if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
if (d_in_lookup(dentry)) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e1c63adb196d..df0c8a724609 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -777,6 +777,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
umode_t mode = le32_to_cpu(info->mode);
dev_t rdev = le32_to_cpu(info->rdev);
+ lockdep_assert_held(&mdsc->snap_rwsem);
+
dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index db80d89556b1..839e6b0239ee 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1218,7 +1218,7 @@ extern const struct dentry_operations ceph_dentry_ops;
extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
- struct dentry *dentry, int err);
+ struct dentry *dentry);
extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err);
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index bf52e9326ebe..7364950a9ef4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -19,6 +19,8 @@ config CIFS
select CRYPTO_LIB_DES
select KEYS
select DNS_RESOLVER
+ select ASN1
+ select OID_REGISTRY
help
This is the client VFS module for the SMB3 family of NAS protocols,
(including support for the most recent, most secure dialect SMB3.1.1)
@@ -57,6 +59,7 @@ config CIFS
config CIFS_STATS2
bool "Extended statistics"
depends on CIFS
+ default y
help
Enabling this option will allow more detailed statistics on SMB
request timing to be displayed in /proc/fs/cifs/DebugData and also
@@ -65,8 +68,7 @@ config CIFS_STATS2
for more details. These additional statistics may have a minor effect
on performance and memory utilization.
- Unless you are a developer or are doing network performance analysis
- or tuning, say N.
+ If unsure, say Y.
config CIFS_ALLOW_INSECURE_LEGACY
bool "Support legacy servers which use less secure dialects"
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 3ee3b7de4ded..87fcacdf3de7 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,12 +6,16 @@ ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_CIFS) += cifs.o
cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \
- inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
+ inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \
cifs_unicode.o nterr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
- dns_resolve.o
+ dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o
+
+$(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h
+
+$(obj)/cifs_spnego_negtokeninit.asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.c $(obj)/cifs_spnego_negtokeninit.asn1.h
cifs-$(CONFIG_CIFS_XATTR) += xattr.o
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 3150c19cdc2f..b5724ef9f182 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -1,612 +1,63 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in
- * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich
- *
- * Copyright (c) 2000 RP Internet (www.rpi.net.au).
- */
#include <linux/module.h>
-#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include "cifspdu.h"
+#include <linux/oid_registry.h>
#include "cifsglob.h"
#include "cifs_debug.h"
#include "cifsproto.h"
+#include "cifs_spnego_negtokeninit.asn1.h"
-/*****************************************************************************
- *
- * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
- *
- *****************************************************************************/
-
-/* Class */
-#define ASN1_UNI 0 /* Universal */
-#define ASN1_APL 1 /* Application */
-#define ASN1_CTX 2 /* Context */
-#define ASN1_PRV 3 /* Private */
-
-/* Tag */
-#define ASN1_EOC 0 /* End Of Contents or N/A */
-#define ASN1_BOL 1 /* Boolean */
-#define ASN1_INT 2 /* Integer */
-#define ASN1_BTS 3 /* Bit String */
-#define ASN1_OTS 4 /* Octet String */
-#define ASN1_NUL 5 /* Null */
-#define ASN1_OJI 6 /* Object Identifier */
-#define ASN1_OJD 7 /* Object Description */
-#define ASN1_EXT 8 /* External */
-#define ASN1_ENUM 10 /* Enumerated */
-#define ASN1_SEQ 16 /* Sequence */
-#define ASN1_SET 17 /* Set */
-#define ASN1_NUMSTR 18 /* Numerical String */
-#define ASN1_PRNSTR 19 /* Printable String */
-#define ASN1_TEXSTR 20 /* Teletext String */
-#define ASN1_VIDSTR 21 /* Video String */
-#define ASN1_IA5STR 22 /* IA5 String */
-#define ASN1_UNITIM 23 /* Universal Time */
-#define ASN1_GENTIM 24 /* General Time */
-#define ASN1_GRASTR 25 /* Graphical String */
-#define ASN1_VISSTR 26 /* Visible String */
-#define ASN1_GENSTR 27 /* General String */
-
-/* Primitive / Constructed methods*/
-#define ASN1_PRI 0 /* Primitive */
-#define ASN1_CON 1 /* Constructed */
-
-/*
- * Error codes.
- */
-#define ASN1_ERR_NOERROR 0
-#define ASN1_ERR_DEC_EMPTY 2
-#define ASN1_ERR_DEC_EOC_MISMATCH 3
-#define ASN1_ERR_DEC_LENGTH_MISMATCH 4
-#define ASN1_ERR_DEC_BADVALUE 5
-
-#define SPNEGO_OID_LEN 7
-#define NTLMSSP_OID_LEN 10
-#define KRB5_OID_LEN 7
-#define KRB5U2U_OID_LEN 8
-#define MSKRB5_OID_LEN 7
-static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 };
-static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 };
-static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 };
-static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 };
-static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 };
-
-/*
- * ASN.1 context.
- */
-struct asn1_ctx {
- int error; /* Error condition */
- unsigned char *pointer; /* Octet just to be decoded */
- unsigned char *begin; /* First octet */
- unsigned char *end; /* Octet after last octet */
-};
-
-/*
- * Octet string (not null terminated)
- */
-struct asn1_octstr {
- unsigned char *data;
- unsigned int len;
-};
-
-static void
-asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len)
-{
- ctx->begin = buf;
- ctx->end = buf + len;
- ctx->pointer = buf;
- ctx->error = ASN1_ERR_NOERROR;
-}
-
-static unsigned char
-asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
-{
- if (ctx->pointer >= ctx->end) {
- ctx->error = ASN1_ERR_DEC_EMPTY;
- return 0;
- }
- *ch = *(ctx->pointer)++;
- return 1;
-}
-
-#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */
-static unsigned char
-asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val)
-{
- unsigned char ch;
-
- if (ctx->pointer >= ctx->end) {
- ctx->error = ASN1_ERR_DEC_EMPTY;
- return 0;
- }
-
- ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */
- if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */
- *val = *(++(ctx->pointer)); /* value has enum value */
- else
- return 0;
-
- ctx->pointer++;
- return 1;
-}
-#endif
-
-static unsigned char
-asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
-{
- unsigned char ch;
-
- *tag = 0;
-
- do {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
- *tag <<= 7;
- *tag |= ch & 0x7F;
- } while ((ch & 0x80) == 0x80);
- return 1;
-}
-
-static unsigned char
-asn1_id_decode(struct asn1_ctx *ctx,
- unsigned int *cls, unsigned int *con, unsigned int *tag)
-{
- unsigned char ch;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *cls = (ch & 0xC0) >> 6;
- *con = (ch & 0x20) >> 5;
- *tag = (ch & 0x1F);
-
- if (*tag == 0x1F) {
- if (!asn1_tag_decode(ctx, tag))
- return 0;
- }
- return 1;
-}
-
-static unsigned char
-asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len)
-{
- unsigned char ch, cnt;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- if (ch == 0x80)
- *def = 0;
- else {
- *def = 1;
-
- if (ch < 0x80)
- *len = ch;
- else {
- cnt = (unsigned char) (ch & 0x7F);
- *len = 0;
-
- while (cnt > 0) {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
- *len <<= 8;
- *len |= ch;
- cnt--;
- }
- }
- }
-
- /* don't trust len bigger than ctx buffer */
- if (*len > ctx->end - ctx->pointer)
- return 0;
-
- return 1;
-}
-
-static unsigned char
-asn1_header_decode(struct asn1_ctx *ctx,
- unsigned char **eoc,
- unsigned int *cls, unsigned int *con, unsigned int *tag)
-{
- unsigned int def = 0;
- unsigned int len = 0;
-
- if (!asn1_id_decode(ctx, cls, con, tag))
- return 0;
-
- if (!asn1_length_decode(ctx, &def, &len))
- return 0;
-
- /* primitive shall be definite, indefinite shall be constructed */
- if (*con == ASN1_PRI && !def)
- return 0;
-
- if (def)
- *eoc = ctx->pointer + len;
- else
- *eoc = NULL;
- return 1;
-}
-
-static unsigned char
-asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+int
+decode_negTokenInit(unsigned char *security_blob, int length,
+ struct TCP_Server_Info *server)
{
- unsigned char ch;
-
- if (eoc == NULL) {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- if (ch != 0x00) {
- ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- if (ch != 0x00) {
- ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
- return 0;
- }
- return 1;
- } else {
- if (ctx->pointer != eoc) {
- ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
- return 0;
- }
+ if (asn1_ber_decoder(&cifs_spnego_negtokeninit_decoder, server,
+ security_blob, length) == 0)
return 1;
- }
-}
-
-/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx,
- unsigned char *eoc)
-{
- ctx->pointer = eoc;
- return 1;
-}
-
-static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
- unsigned char *eoc, long *integer)
-{
- unsigned char ch;
- unsigned int len;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer = (signed char) ch;
- len = 1;
-
- while (ctx->pointer < eoc) {
- if (++len > sizeof(long)) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer <<= 8;
- *integer |= ch;
- }
- return 1;
-}
-
-static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- unsigned int *integer)
-{
- unsigned char ch;
- unsigned int len;
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer = ch;
- if (ch == 0)
- len = 0;
else
- len = 1;
-
- while (ctx->pointer < eoc) {
- if (++len > sizeof(unsigned int)) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer <<= 8;
- *integer |= ch;
- }
- return 1;
-}
-
-static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- unsigned long *integer)
-{
- unsigned char ch;
- unsigned int len;
-
- if (!asn1_octet_decode(ctx, &ch))
return 0;
-
- *integer = ch;
- if (ch == 0)
- len = 0;
- else
- len = 1;
-
- while (ctx->pointer < eoc) {
- if (++len > sizeof(unsigned long)) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- return 0;
- }
-
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *integer <<= 8;
- *integer |= ch;
- }
- return 1;
}
-static unsigned char
-asn1_octets_decode(struct asn1_ctx *ctx,
- unsigned char *eoc,
- unsigned char **octets, unsigned int *len)
+int cifs_gssapi_this_mech(void *context, size_t hdrlen,
+ unsigned char tag, const void *value, size_t vlen)
{
- unsigned char *ptr;
-
- *len = 0;
-
- *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
- if (*octets == NULL) {
- return 0;
- }
-
- ptr = *octets;
- while (ctx->pointer < eoc) {
- if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) {
- kfree(*octets);
- *octets = NULL;
- return 0;
- }
- (*len)++;
- }
- return 1;
-} */
-
-static unsigned char
-asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid)
-{
- unsigned char ch;
-
- *subid = 0;
-
- do {
- if (!asn1_octet_decode(ctx, &ch))
- return 0;
-
- *subid <<= 7;
- *subid |= ch & 0x7F;
- } while ((ch & 0x80) == 0x80);
- return 1;
-}
-
-static int
-asn1_oid_decode(struct asn1_ctx *ctx,
- unsigned char *eoc, unsigned long **oid, unsigned int *len)
-{
- unsigned long subid;
- unsigned int size;
- unsigned long *optr;
-
- size = eoc - ctx->pointer + 1;
-
- /* first subid actually encodes first two subids */
- if (size < 2 || size > UINT_MAX/sizeof(unsigned long))
- return 0;
-
- *oid = kmalloc_array(size, sizeof(unsigned long), GFP_ATOMIC);
- if (*oid == NULL)
- return 0;
-
- optr = *oid;
-
- if (!asn1_subid_decode(ctx, &subid)) {
- kfree(*oid);
- *oid = NULL;
- return 0;
- }
-
- if (subid < 40) {
- optr[0] = 0;
- optr[1] = subid;
- } else if (subid < 80) {
- optr[0] = 1;
- optr[1] = subid - 40;
- } else {
- optr[0] = 2;
- optr[1] = subid - 80;
- }
-
- *len = 2;
- optr += 2;
+ enum OID oid;
- while (ctx->pointer < eoc) {
- if (++(*len) > size) {
- ctx->error = ASN1_ERR_DEC_BADVALUE;
- kfree(*oid);
- *oid = NULL;
- return 0;
- }
+ oid = look_up_OID(value, vlen);
+ if (oid != OID_spnego) {
+ char buf[50];
- if (!asn1_subid_decode(ctx, optr++)) {
- kfree(*oid);
- *oid = NULL;
- return 0;
- }
+ sprint_oid(value, vlen, buf, sizeof(buf));
+ cifs_dbg(FYI, "Error decoding negTokenInit header: unexpected OID %s\n",
+ buf);
+ return -EBADMSG;
}
- return 1;
+ return 0;
}
-static int
-compare_oid(unsigned long *oid1, unsigned int oid1len,
- unsigned long *oid2, unsigned int oid2len)
+int cifs_neg_token_init_mech_type(void *context, size_t hdrlen,
+ unsigned char tag,
+ const void *value, size_t vlen)
{
- unsigned int i;
+ struct TCP_Server_Info *server = context;
+ enum OID oid;
- if (oid1len != oid2len)
- return 0;
+ oid = look_up_OID(value, vlen);
+ if (oid == OID_mskrb5)
+ server->sec_mskerberos = true;
+ else if (oid == OID_krb5u2u)
+ server->sec_kerberosu2u = true;
+ else if (oid == OID_krb5)
+ server->sec_kerberos = true;
+ else if (oid == OID_ntlmssp)
+ server->sec_ntlmssp = true;
else {
- for (i = 0; i < oid1len; i++) {
- if (oid1[i] != oid2[i])
- return 0;
- }
- return 1;
- }
-}
-
- /* BB check for endian conversion issues here */
-
-int
-decode_negTokenInit(unsigned char *security_blob, int length,
- struct TCP_Server_Info *server)
-{
- struct asn1_ctx ctx;
- unsigned char *end;
- unsigned char *sequence_end;
- unsigned long *oid = NULL;
- unsigned int cls, con, tag, oidlen, rc;
-
- /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
-
- asn1_open(&ctx, security_blob, length);
+ char buf[50];
- /* GSSAPI header */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cifs_dbg(FYI, "Error decoding negTokenInit header\n");
- return 0;
- } else if ((cls != ASN1_APL) || (con != ASN1_CON)
- || (tag != ASN1_EOC)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag);
- return 0;
+ sprint_oid(value, vlen, buf, sizeof(buf));
+ cifs_dbg(FYI, "Decoding negTokenInit: unsupported OID %s\n",
+ buf);
}
-
- /* Check for SPNEGO OID -- remember to free obj->oid */
- rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
- if (rc) {
- if ((tag == ASN1_OJI) && (con == ASN1_PRI) &&
- (cls == ASN1_UNI)) {
- rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
- if (rc) {
- rc = compare_oid(oid, oidlen, SPNEGO_OID,
- SPNEGO_OID_LEN);
- kfree(oid);
- }
- } else
- rc = 0;
- }
-
- /* SPNEGO OID not present or garbled -- bail out */
- if (!rc) {
- cifs_dbg(FYI, "Error decoding negTokenInit header\n");
- return 0;
- }
-
- /* SPNEGO */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cifs_dbg(FYI, "Error decoding negTokenInit\n");
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
- || (tag != ASN1_EOC)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n",
- cls, con, tag, end);
- return 0;
- }
-
- /* negTokenInit */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cifs_dbg(FYI, "Error decoding negTokenInit\n");
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
- || (tag != ASN1_SEQ)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n",
- cls, con, tag, end);
- return 0;
- }
-
- /* sequence */
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
- || (tag != ASN1_EOC)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n",
- cls, con, tag, end);
- return 0;
- }
-
- /* sequence of */
- if (asn1_header_decode
- (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
- cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n");
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
- || (tag != ASN1_SEQ)) {
- cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n",
- cls, con, tag, sequence_end);
- return 0;
- }
-
- /* list of security mechanisms */
- while (!asn1_eoc_decode(&ctx, sequence_end)) {
- rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
- if (!rc) {
- cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n");
- return 0;
- }
- if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
- if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
-
- cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n",
- oidlen, *oid, *(oid + 1), *(oid + 2),
- *(oid + 3));
-
- if (compare_oid(oid, oidlen, MSKRB5_OID,
- MSKRB5_OID_LEN))
- server->sec_mskerberos = true;
- else if (compare_oid(oid, oidlen, KRB5U2U_OID,
- KRB5U2U_OID_LEN))
- server->sec_kerberosu2u = true;
- else if (compare_oid(oid, oidlen, KRB5_OID,
- KRB5_OID_LEN))
- server->sec_kerberos = true;
- else if (compare_oid(oid, oidlen, NTLMSSP_OID,
- NTLMSSP_OID_LEN))
- server->sec_ntlmssp = true;
-
- kfree(oid);
- }
- } else {
- cifs_dbg(FYI, "Should be an oid what is going on?\n");
- }
- }
-
- /*
- * We currently ignore anything at the end of the SPNEGO blob after
- * the mechTypes have been parsed, since none of that info is
- * used at the moment.
- */
- return 1;
+ return 0;
}
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 488fe0ffc1ef..8a3b30ec860c 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/cache.c - CIFS filesystem cache index structure definitions
*
* Copyright (c) 2010 Novell, Inc.
* Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "fscache.h"
#include "cifs_debug.h"
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 68e8e5b27841..8857ac7e7a14 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -50,7 +50,6 @@ void cifs_dump_detail(void *buf, struct TCP_Server_Info *server)
void cifs_dump_mids(struct TCP_Server_Info *server)
{
#ifdef CONFIG_CIFS_DEBUG2
- struct list_head *tmp;
struct mid_q_entry *mid_entry;
if (server == NULL)
@@ -58,8 +57,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
cifs_dbg(VFS, "Dump pending requests:\n");
spin_lock(&GlobalMid_Lock);
- list_for_each(tmp, &server->pending_mid_q) {
- mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
mid_entry->mid_state,
le16_to_cpu(mid_entry->command),
@@ -168,7 +166,7 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
{
- struct list_head *stmp, *tmp, *tmp1, *tmp2;
+ struct list_head *tmp, *tmp1, *tmp2;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -183,9 +181,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
seq_printf(m, " <filename>\n");
#endif /* CIFS_DEBUG2 */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(stmp, &cifs_tcp_ses_list) {
- server = list_entry(stmp, struct TCP_Server_Info,
- tcp_ses_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
list_for_each(tmp1, &ses->tcon_list) {
@@ -220,7 +216,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
- struct list_head *tmp1, *tmp2, *tmp3;
+ struct list_head *tmp2, *tmp3;
struct mid_q_entry *mid_entry;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
@@ -278,11 +274,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
c = 0;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp1, &cifs_tcp_ses_list) {
- server = list_entry(tmp1, struct TCP_Server_Info,
- tcp_ses_list);
-
- /* channel info will be printed as a part of sessions below */
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
if (server->is_channel)
continue;
@@ -563,7 +555,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CIFS_STATS2
int j;
#endif /* STATS2 */
- struct list_head *tmp1, *tmp2, *tmp3;
+ struct list_head *tmp2, *tmp3;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -594,9 +586,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
i = 0;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp1, &cifs_tcp_ses_list) {
- server = list_entry(tmp1, struct TCP_Server_Info,
- tcp_ses_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
seq_printf(m, "\nMax requests in flight: %d", server->max_in_flight);
#ifdef CONFIG_CIFS_STATS2
seq_puts(m, "\nTotal time spent processing by command. Time ");
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5e66dab712d0..ee4ea2b60c0f 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -3,7 +3,7 @@
*
* Copyright (c) International Business Machines Corp., 2000,2002
* Modified by Steve French (sfrench@us.ibm.com)
-*/
+ */
#ifndef _H_CIFS_DEBUG
#define _H_CIFS_DEBUG
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index c87c37cf2914..ec57cdb1590f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -125,7 +125,7 @@ cifs_build_devname(char *nodename, const char *prepath)
* @sb_mountdata: parent/root DFS mount options (template)
* @fullpath: full path in UNC format
* @ref: optional server's referral
- *
+ * @devname: return the built cifs device name if passed pointer not NULL
* creates mount options for submount based on template options sb_mountdata
* and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
*
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9c45b3a82ad9..4fd788586399 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -1,19 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifs_fs_sb.h
*
* Copyright (c) International Business Machines Corp., 2002,2004
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
*/
#include <linux/rbtree.h>
@@ -72,11 +63,12 @@ struct cifs_sb_info {
char *prepath;
/*
- * Path initially provided by the mount call. We might connect
- * to something different via DFS but we want to keep it to do
- * failover properly.
+ * Canonical DFS path initially provided by the mount call. We might connect to something
+ * different via DFS but we want to keep it to do failover properly.
*/
char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */
+ /* randomly generated 128-bit number for indexing dfs mount groups in referral cache */
+ uuid_t dfs_mount_id;
/*
* Indicate whether serverino option was turned off later
* (cifs_autodisable_serverino) in order to match new mounts.
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index 4a97fe12006b..ef723be358af 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifs_ioctl.h
*
@@ -5,16 +6,6 @@
*
* Copyright (c) 2015 Steve French <steve.french@primarydata.com>
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
*/
struct smb_mnt_fs_info {
@@ -72,15 +63,28 @@ struct smb3_key_debug_info {
} __packed;
/*
- * Dump full key (32 byte encrypt/decrypt keys instead of 16 bytes)
- * is needed if GCM256 (stronger encryption) negotiated
+ * Dump variable-sized keys
*/
struct smb3_full_key_debug_info {
- __u64 Suid;
+ /* INPUT: size of userspace buffer */
+ __u32 in_size;
+
+ /*
+ * INPUT: 0 for current user, otherwise session to dump
+ * OUTPUT: session id that was dumped
+ */
+ __u64 session_id;
__u16 cipher_type;
- __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */
- __u8 smb3encryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */
- __u8 smb3decryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */
+ __u8 session_key_length;
+ __u8 server_in_key_length;
+ __u8 server_out_key_length;
+ __u8 data[];
+ /*
+ * return this struct with the keys appended at the end:
+ * __u8 session_key[session_key_length];
+ * __u8 server_in_key[server_in_key_length];
+ * __u8 server_out_key[server_out_key_length];
+ */
} __packed;
struct smb3_notify {
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 7b9b876b513b..8fa26a8530f8 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS
*
* Copyright (c) 2007 Red Hat, Inc.
* Author(s): Jeff Layton (jlayton@redhat.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/list.h>
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 31bef9ee078b..31387d0ea32e 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS
*
@@ -5,19 +6,6 @@
* Author(s): Jeff Layton (jlayton@redhat.com)
* Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _CIFS_SPNEGO_H
diff --git a/fs/cifs/cifs_spnego_negtokeninit.asn1 b/fs/cifs/cifs_spnego_negtokeninit.asn1
new file mode 100644
index 000000000000..181c083887d5
--- /dev/null
+++ b/fs/cifs/cifs_spnego_negtokeninit.asn1
@@ -0,0 +1,40 @@
+GSSAPI ::=
+ [APPLICATION 0] IMPLICIT SEQUENCE {
+ thisMech
+ OBJECT IDENTIFIER ({cifs_gssapi_this_mech}),
+ negotiationToken
+ NegotiationToken
+ }
+
+MechType ::= OBJECT IDENTIFIER ({cifs_neg_token_init_mech_type})
+
+MechTypeList ::= SEQUENCE OF MechType
+
+NegHints ::= SEQUENCE {
+ hintName
+ [0] GeneralString OPTIONAL,
+ hintAddress
+ [1] OCTET STRING OPTIONAL
+ }
+
+NegTokenInit2 ::=
+ SEQUENCE {
+ mechTypes
+ [0] MechTypeList OPTIONAL,
+ reqFlags
+ [1] BIT STRING OPTIONAL,
+ mechToken
+ [2] OCTET STRING OPTIONAL,
+ negHints
+ [3] NegHints OPTIONAL,
+ mechListMIC
+ [3] OCTET STRING OPTIONAL
+ }
+
+NegotiationToken ::=
+ CHOICE {
+ negTokenInit
+ [0] NegTokenInit2,
+ negTokenTarg
+ [1] ANY
+ }
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index d829b8bf833e..93b47818c6c2 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -447,15 +447,13 @@ static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new,
const struct sockaddr_storage *old,
struct sockaddr_storage *dst)
{
- __be16 port;
+ __be16 port = cpu_to_be16(CIFS_PORT);
if (old->ss_family == AF_INET) {
struct sockaddr_in *ipv4 = (struct sockaddr_in *)old;
port = ipv4->sin_port;
- }
-
- if (old->ss_family == AF_INET6) {
+ } else if (old->ss_family == AF_INET6) {
struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)old;
port = ipv6->sin6_port;
@@ -465,9 +463,7 @@ static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new,
struct sockaddr_in *ipv4 = (struct sockaddr_in *)new;
ipv4->sin_port = port;
- }
-
- if (new->ss_family == AF_INET6) {
+ } else if (new->ss_family == AF_INET6) {
struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)new;
ipv6->sin6_port = port;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 784407f9280f..388eb536cff1 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/cifsacl.c
*
@@ -6,19 +7,6 @@
*
* Contains the routines for mapping CIFS/NTFS ACLs
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
@@ -409,7 +397,6 @@ try_upcall_to_get_id:
saved_cred = override_creds(root_cred);
sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
if (IS_ERR(sidkey)) {
- rc = -EINVAL;
cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n",
__func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g');
goto out_revert_creds;
@@ -422,7 +409,6 @@ try_upcall_to_get_id:
*/
BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
if (sidkey->datalen != sizeof(uid_t)) {
- rc = -EIO;
cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n",
__func__, sidkey->datalen);
key_invalidate(sidkey);
@@ -1308,7 +1294,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset);
ndacl_ptr->revision =
dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION);
- ndacl_ptr->num_aces = dacl_ptr->num_aces;
+ ndacl_ptr->num_aces = dacl_ptr ? dacl_ptr->num_aces : 0;
if (uid_valid(uid)) { /* chown */
uid_t id;
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index d9e704979d99..f8292bcf8594 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -1,28 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifsacl.h
*
* Copyright (c) International Business Machines Corp., 2007
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _CIFSACL_H
#define _CIFSACL_H
-
#define NUM_AUTHS (6) /* number of authority fields */
#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index b8f1ff9a83f3..ecf15d845dbd 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/cifsencrypt.c
*
@@ -7,19 +8,6 @@
* Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d7ea9c5fe0f8..9fb874dd8d24 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/cifsfs.c
*
@@ -6,19 +7,6 @@
*
* Common Internet FileSystem (CIFS) client
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* Note that BB means BUGBUG (ie something to fix eventually) */
@@ -133,7 +121,7 @@ struct workqueue_struct *cifsiod_wq;
struct workqueue_struct *decrypt_wq;
struct workqueue_struct *fileinfo_put_wq;
struct workqueue_struct *cifsoplockd_wq;
-struct workqueue_struct *deferredclose_wq;
+struct workqueue_struct *deferredclose_wq;
__u32 cifs_lock_secret;
/*
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6beddb108ba0..177f3e7ab86d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifsfs.h
*
* Copyright (c) International Business Machines Corp., 2002, 2007
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _CIFSFS_H
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d88b4b523dcc..3100f8b66e60 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifsglob.h
*
@@ -5,16 +6,6 @@
* Author(s): Steve French (sfrench@us.ibm.com)
* Jeremy Allison (jra@samba.org)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
*/
#ifndef _CIFS_GLOB_H
#define _CIFS_GLOB_H
@@ -630,7 +621,7 @@ struct TCP_Server_Info {
/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
unsigned int capabilities; /* selective disabling of caps by smb sess */
int timeAdj; /* Adjust for difference in server time zone in sec */
- __u64 CurrentMid; /* multiplex id - rotating counter */
+ __u64 CurrentMid; /* multiplex id - rotating counter, protected by GlobalMid_Lock */
char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
@@ -896,7 +887,7 @@ struct cifs_ses {
struct mutex session_mutex;
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
- enum statusEnum status;
+ enum statusEnum status; /* updates protected by GlobalMid_Lock */
unsigned overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
@@ -1093,8 +1084,7 @@ struct cifs_tcon {
struct cached_fid crfid; /* Cached root fid */
/* BB add field for back pointer to sb struct(s)? */
#ifdef CONFIG_CIFS_DFS_UPCALL
- char *dfs_path;
- int remap:2;
+ char *dfs_path; /* canonical DFS path */
struct list_head ulist; /* cache update list */
#endif
};
@@ -1257,8 +1247,7 @@ struct cifsFileInfo {
struct work_struct oplock_break; /* work for oplock breaks */
struct work_struct put; /* work for the final part of _put */
struct delayed_work deferred;
- bool oplock_break_received; /* Flag to indicate oplock break */
- bool deferred_scheduled;
+ bool deferred_close_scheduled; /* Flag to indicate close is scheduled */
};
struct cifs_io_parms {
@@ -1418,6 +1407,7 @@ struct cifsInodeInfo {
struct inode vfs_inode;
struct list_head deferred_closes; /* list of deferred closes */
spinlock_t deferred_lock; /* protection on deferred list */
+ bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
};
static inline struct cifsInodeInfo *
@@ -1795,6 +1785,8 @@ require use of the stronger protocol */
* list operations on pending_mid_q and oplockQ
* updates to XID counters, multiplex id and SMB sequence numbers
* list operations on global DnotifyReqList
+ * updates to ses->status
+ * updates to server->CurrentMid
* tcp_ses_lock protects:
* list operations on tcp and SMB session lists
* tcon->open_file_lock protects the list of open files hanging off the tcon
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b53a87db282f..0923f72d27e9 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifspdu.h
*
* Copyright (c) International Business Machines Corp., 2002,2009
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _CIFSPDU_H
@@ -148,7 +136,8 @@
#define SMB3_SIGN_KEY_SIZE (16)
/*
- * Size of the smb3 encryption/decryption keys
+ * Size of the smb3 encryption/decryption key storage.
+ * This size is big enough to store any cipher key types.
*/
#define SMB3_ENC_DEC_KEY_SIZE (32)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d30cba44ba29..e0def0f0714b 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/cifsproto.h
*
* Copyright (c) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _CIFSPROTO_H
#define _CIFSPROTO_H
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 41f74163cc1c..58ebec4d4413 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/cifssmb.c
*
@@ -6,19 +7,6 @@
*
* Contains the routines for constructing the SMB PDUs themselves
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */
@@ -1220,7 +1208,7 @@ SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
int *pOplock, FILE_ALL_INFO *pfile_info,
const struct nls_table *nls_codepage, int remap)
{
- int rc = -EACCES;
+ int rc;
OPENX_REQ *pSMB = NULL;
OPENX_RSP *pSMBr = NULL;
int bytes_returned;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 495c395f9def..5d269f583dac 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/connect.c
*
* Copyright (C) International Business Machines Corp., 2002,2011
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/net.h>
@@ -368,13 +356,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
cifs_server_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n",
__func__, rc);
}
- rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server);
- if (rc) {
- cifs_server_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n",
- __func__, rc);
- }
dfs_cache_free_tgts(&tgt_list);
-
}
cifs_put_tcp_super(sb);
@@ -1557,29 +1539,25 @@ out:
/**
* cifs_free_ipc - helper to release the session IPC tcon
*
- * Needs to be called everytime a session is destroyed
+ * Needs to be called every time a session is destroyed.
+ *
+ * On session close, the IPC is closed and the server must release all tcons of the session.
+ * No need to send a tree disconnect here.
+ *
+ * Besides, it will make the server not close durable and resilient files on session close, as
+ * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request.
*/
static int
cifs_free_ipc(struct cifs_ses *ses)
{
- int rc = 0, xid;
struct cifs_tcon *tcon = ses->tcon_ipc;
if (tcon == NULL)
return 0;
- if (ses->server->ops->tree_disconnect) {
- xid = get_xid();
- rc = ses->server->ops->tree_disconnect(xid, tcon);
- free_xid(xid);
- }
-
- if (rc)
- cifs_dbg(FYI, "failed to disconnect IPC tcon (rc=%d)\n", rc);
-
tconInfoFree(tcon);
ses->tcon_ipc = NULL;
- return rc;
+ return 0;
}
static struct cifs_ses *
@@ -1605,7 +1583,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
{
unsigned int rc, xid;
struct TCP_Server_Info *server = ses->server;
-
cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
spin_lock(&cifs_tcp_ses_lock);
@@ -1613,13 +1590,20 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
spin_unlock(&cifs_tcp_ses_lock);
return;
}
+
+ cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
+ cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE");
+
if (--ses->ses_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ spin_lock(&GlobalMid_Lock);
if (ses->status == CifsGood)
ses->status = CifsExiting;
- spin_unlock(&cifs_tcp_ses_lock);
+ spin_unlock(&GlobalMid_Lock);
cifs_free_ipc(ses);
@@ -1951,10 +1935,7 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &ses->tcon_list) {
tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (tcon->dfs_path)
- continue;
-#endif
+
if (!match_tcon(tcon, ctx))
continue;
++tcon->tc_count;
@@ -3017,9 +2998,8 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
return rc;
}
-static inline int get_next_dfs_tgt(const char *path,
- struct dfs_cache_tgt_list *tgt_list,
- struct dfs_cache_tgt_iterator **tgt_it)
+static int get_next_dfs_tgt(struct dfs_cache_tgt_list *tgt_list,
+ struct dfs_cache_tgt_iterator **tgt_it)
{
if (!*tgt_it)
*tgt_it = dfs_cache_get_tgt_iterator(tgt_list);
@@ -3059,6 +3039,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
struct cifs_ses **ses, struct cifs_tcon **tcon)
{
int rc;
+ char *npath = NULL;
struct dfs_cache_tgt_list tgt_list = {0};
struct dfs_cache_tgt_iterator *tgt_it = NULL;
struct smb3_fs_context tmp_ctx = {NULL};
@@ -3066,11 +3047,15 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
return -EOPNOTSUPP;
- cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, path, full_path);
+ npath = dfs_cache_canonical_path(path, cifs_sb->local_nls, cifs_remap(cifs_sb));
+ if (IS_ERR(npath))
+ return PTR_ERR(npath);
+
+ cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, npath, full_path);
- rc = dfs_cache_noreq_find(path, NULL, &tgt_list);
+ rc = dfs_cache_noreq_find(npath, NULL, &tgt_list);
if (rc)
- return rc;
+ goto out;
/*
* We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to
* test connection against new DFS targets.
@@ -3084,11 +3069,11 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
char *fake_devname = NULL, *mdata = NULL;
/* Get next DFS target server - if any */
- rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it);
+ rc = get_next_dfs_tgt(&tgt_list, &tgt_it);
if (rc)
break;
- rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref);
+ rc = dfs_cache_get_tgt_referral(npath, tgt_it, &ref);
if (rc)
break;
@@ -3137,6 +3122,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_
}
out:
+ kfree(npath);
smb3_cleanup_fs_context_contents(&tmp_ctx);
dfs_cache_free_tgts(&tgt_list);
return rc;
@@ -3288,25 +3274,18 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *
}
#ifdef CONFIG_CIFS_DFS_UPCALL
-static void set_root_ses(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
+static void set_root_ses(struct cifs_sb_info *cifs_sb, const uuid_t *mount_id, struct cifs_ses *ses,
struct cifs_ses **root_ses)
{
if (ses) {
spin_lock(&cifs_tcp_ses_lock);
ses->ses_count++;
- if (ses->tcon_ipc)
- ses->tcon_ipc->remap = cifs_remap(cifs_sb);
spin_unlock(&cifs_tcp_ses_lock);
+ dfs_cache_add_refsrv_session(mount_id, ses);
}
*root_ses = ses;
}
-static void put_root_ses(struct cifs_ses *ses)
-{
- if (ses)
- cifs_put_smb_ses(ses);
-}
-
/* Set up next dfs prefix path in @dfs_path */
static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx,
const unsigned int xid, struct TCP_Server_Info *server,
@@ -3352,17 +3331,25 @@ out:
}
/* Check if resolved targets can handle any DFS referrals */
-static int is_referral_server(const char *ref_path, struct cifs_tcon *tcon, bool *ref_server)
+static int is_referral_server(const char *ref_path, struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon, bool *ref_server)
{
int rc;
struct dfs_info3_param ref = {0};
+ cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path);
+
if (is_tcon_dfs(tcon)) {
*ref_server = true;
} else {
- cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path);
+ char *npath;
- rc = dfs_cache_noreq_find(ref_path, &ref, NULL);
+ npath = dfs_cache_canonical_path(ref_path, cifs_sb->local_nls, cifs_remap(cifs_sb));
+ if (IS_ERR(npath))
+ return PTR_ERR(npath);
+
+ rc = dfs_cache_noreq_find(npath, &ref, NULL);
+ kfree(npath);
if (rc) {
cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc);
return rc;
@@ -3386,9 +3373,9 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
struct cifs_ses *ses = NULL, *root_ses = NULL;
struct cifs_tcon *tcon = NULL;
int count = 0;
+ uuid_t mount_id = {0};
char *ref_path = NULL, *full_path = NULL;
char *oldmnt = NULL;
- char *mntdata = NULL;
bool ref_server = false;
rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon);
@@ -3411,12 +3398,9 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
if (rc != -EREMOTE)
goto error;
}
- /* Save mount options */
- mntdata = kstrdup(cifs_sb->ctx->mount_options, GFP_KERNEL);
- if (!mntdata) {
- rc = -ENOMEM;
- goto error;
- }
+
+ ctx->nosharesock = true;
+
/* Get path of DFS root */
ref_path = build_unc_path_to_root(ctx, cifs_sb, false);
if (IS_ERR(ref_path)) {
@@ -3425,7 +3409,8 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
goto error;
}
- set_root_ses(cifs_sb, ses, &root_ses);
+ uuid_gen(&mount_id);
+ set_root_ses(cifs_sb, &mount_id, ses, &root_ses);
do {
/* Save full path of last DFS path we used to resolve final target server */
kfree(full_path);
@@ -3456,13 +3441,11 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
continue;
/* Make sure that requests go through new root servers */
- rc = is_referral_server(ref_path + 1, tcon, &ref_server);
+ rc = is_referral_server(ref_path + 1, cifs_sb, tcon, &ref_server);
if (rc)
break;
- if (ref_server) {
- put_root_ses(root_ses);
- set_root_ses(cifs_sb, ses, &root_ses);
- }
+ if (ref_server)
+ set_root_ses(cifs_sb, &mount_id, ses, &root_ses);
/* Get next dfs path and then continue chasing them if -EREMOTE */
rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path);
@@ -3471,12 +3454,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
rc = -ELOOP;
} while (rc == -EREMOTE);
- if (rc)
+ if (rc || !tcon)
goto error;
- put_root_ses(root_ses);
- root_ses = NULL;
+
kfree(ref_path);
- ref_path = NULL;
/*
* Store DFS full path in both superblock and tree connect structures.
*
@@ -3485,21 +3466,27 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
* links, the prefix path is included in both and may be changed during reconnect. See
* cifs_tree_connect().
*/
- cifs_sb->origin_fullpath = kstrdup(full_path, GFP_KERNEL);
- if (!cifs_sb->origin_fullpath) {
+ ref_path = dfs_cache_canonical_path(full_path, cifs_sb->local_nls, cifs_remap(cifs_sb));
+ kfree(full_path);
+ full_path = NULL;
+
+ if (IS_ERR(ref_path)) {
+ rc = PTR_ERR(ref_path);
+ ref_path = NULL;
+ goto error;
+ }
+ cifs_sb->origin_fullpath = ref_path;
+
+ ref_path = kstrdup(cifs_sb->origin_fullpath, GFP_KERNEL);
+ if (!ref_path) {
rc = -ENOMEM;
goto error;
}
spin_lock(&cifs_tcp_ses_lock);
- tcon->dfs_path = full_path;
- full_path = NULL;
- tcon->remap = cifs_remap(cifs_sb);
+ tcon->dfs_path = ref_path;
+ ref_path = NULL;
spin_unlock(&cifs_tcp_ses_lock);
- /* Add original context for DFS cache to be used when refreshing referrals */
- rc = dfs_cache_add_vol(mntdata, ctx, cifs_sb->origin_fullpath);
- if (rc)
- goto error;
/*
* After reconnecting to a different server, unique ids won't
* match anymore, so we disable serverino. This prevents
@@ -3514,6 +3501,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
kfree(cifs_sb->prepath);
cifs_sb->prepath = ctx->prepath;
ctx->prepath = NULL;
+ uuid_copy(&cifs_sb->dfs_mount_id, &mount_id);
out:
free_xid(xid);
@@ -3523,9 +3511,8 @@ out:
error:
kfree(ref_path);
kfree(full_path);
- kfree(mntdata);
kfree(cifs_sb->origin_fullpath);
- put_root_ses(root_ses);
+ dfs_cache_put_refsrv_sessions(&mount_id);
mount_put_conns(cifs_sb, xid, server, ses, tcon);
return rc;
}
@@ -3755,7 +3742,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
kfree(cifs_sb->prepath);
#ifdef CONFIG_CIFS_DFS_UPCALL
- dfs_cache_del_vol(cifs_sb->origin_fullpath);
+ dfs_cache_put_refsrv_sessions(&cifs_sb->dfs_mount_id);
kfree(cifs_sb->origin_fullpath);
#endif
call_rcu(&cifs_sb->rcu, delayed_free);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index b1fa30fefe1f..7c1769714609 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/nls.h>
#include <linux/workqueue.h>
+#include <linux/uuid.h>
#include "cifsglob.h"
#include "smb2pdu.h"
#include "smb2proto.h"
@@ -18,15 +19,14 @@
#include "cifs_debug.h"
#include "cifs_unicode.h"
#include "smb2glob.h"
-#include "fs_context.h"
#include "dfs_cache.h"
#define CACHE_HTABLE_SIZE 32
#define CACHE_MAX_ENTRIES 64
+#define CACHE_MIN_TTL 120 /* 2 minutes */
-#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \
- DFSREF_STORAGE_SERVER))
+#define IS_DFS_INTERLINK(v) (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER))
struct cache_dfs_tgt {
char *name;
@@ -48,14 +48,15 @@ struct cache_entry {
struct cache_dfs_tgt *tgthint;
};
-struct vol_info {
- char *fullpath;
- spinlock_t ctx_lock;
- struct smb3_fs_context ctx;
- char *mntdata;
+/* List of referral server sessions per dfs mount */
+struct mount_group {
struct list_head list;
- struct list_head rlist;
- struct kref refcnt;
+ uuid_t id;
+ struct cifs_ses *sessions[CACHE_MAX_ENTRIES];
+ int num_sessions;
+ spinlock_t lock;
+ struct list_head refresh_list;
+ struct kref refcount;
};
static struct kmem_cache *cache_slab __read_mostly;
@@ -64,7 +65,7 @@ static struct workqueue_struct *dfscache_wq __read_mostly;
static int cache_ttl;
static DEFINE_SPINLOCK(cache_ttl_lock);
-static struct nls_table *cache_nlsc;
+static struct nls_table *cache_cp;
/*
* Number of entries in the cache
@@ -74,34 +75,145 @@ static atomic_t cache_count;
static struct hlist_head cache_htable[CACHE_HTABLE_SIZE];
static DECLARE_RWSEM(htable_rw_lock);
-static LIST_HEAD(vol_list);
-static DEFINE_SPINLOCK(vol_list_lock);
+static LIST_HEAD(mount_group_list);
+static DEFINE_MUTEX(mount_group_list_lock);
static void refresh_cache_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker);
-static int get_normalized_path(const char *path, const char **npath)
+static void get_ipc_unc(const char *ref_path, char *ipc, size_t ipclen)
{
- if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/'))
- return -EINVAL;
+ const char *host;
+ size_t len;
- if (*path == '\\') {
- *npath = path;
- } else {
- char *s = kstrdup(path, GFP_KERNEL);
- if (!s)
- return -ENOMEM;
- convert_delimiter(s, '\\');
- *npath = s;
+ extract_unc_hostname(ref_path, &host, &len);
+ scnprintf(ipc, ipclen, "\\\\%.*s\\IPC$", (int)len, host);
+}
+
+static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const char *path)
+{
+ char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0};
+
+ get_ipc_unc(path, unc, sizeof(unc));
+ for (; *ses; ses++) {
+ if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName))
+ return *ses;
}
- return 0;
+ return ERR_PTR(-ENOENT);
+}
+
+static void __mount_group_release(struct mount_group *mg)
+{
+ int i;
+
+ for (i = 0; i < mg->num_sessions; i++)
+ cifs_put_smb_ses(mg->sessions[i]);
+ kfree(mg);
+}
+
+static void mount_group_release(struct kref *kref)
+{
+ struct mount_group *mg = container_of(kref, struct mount_group, refcount);
+
+ mutex_lock(&mount_group_list_lock);
+ list_del(&mg->list);
+ mutex_unlock(&mount_group_list_lock);
+ __mount_group_release(mg);
+}
+
+static struct mount_group *find_mount_group_locked(const uuid_t *id)
+{
+ struct mount_group *mg;
+
+ list_for_each_entry(mg, &mount_group_list, list) {
+ if (uuid_equal(&mg->id, id))
+ return mg;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
+static struct mount_group *__get_mount_group_locked(const uuid_t *id)
+{
+ struct mount_group *mg;
+
+ mg = find_mount_group_locked(id);
+ if (!IS_ERR(mg))
+ return mg;
+
+ mg = kmalloc(sizeof(*mg), GFP_KERNEL);
+ if (!mg)
+ return ERR_PTR(-ENOMEM);
+ kref_init(&mg->refcount);
+ uuid_copy(&mg->id, id);
+ mg->num_sessions = 0;
+ spin_lock_init(&mg->lock);
+ list_add(&mg->list, &mount_group_list);
+ return mg;
+}
+
+static struct mount_group *get_mount_group(const uuid_t *id)
+{
+ struct mount_group *mg;
+
+ mutex_lock(&mount_group_list_lock);
+ mg = __get_mount_group_locked(id);
+ if (!IS_ERR(mg))
+ kref_get(&mg->refcount);
+ mutex_unlock(&mount_group_list_lock);
+
+ return mg;
}
-static inline void free_normalized_path(const char *path, const char *npath)
+static void free_mount_group_list(void)
{
- if (path != npath)
- kfree(npath);
+ struct mount_group *mg, *tmp_mg;
+
+ list_for_each_entry_safe(mg, tmp_mg, &mount_group_list, list) {
+ list_del_init(&mg->list);
+ __mount_group_release(mg);
+ }
+}
+
+/**
+ * dfs_cache_canonical_path - get a canonical DFS path
+ *
+ * @path: DFS path
+ * @cp: codepage
+ * @remap: mapping type
+ *
+ * Return canonical path if success, otherwise error.
+ */
+char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap)
+{
+ char *tmp;
+ int plen = 0;
+ char *npath;
+
+ if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/'))
+ return ERR_PTR(-EINVAL);
+
+ if (unlikely(strcmp(cp->charset, cache_cp->charset))) {
+ tmp = (char *)cifs_strndup_to_utf16(path, strlen(path), &plen, cp, remap);
+ if (!tmp) {
+ cifs_dbg(VFS, "%s: failed to convert path to utf16\n", __func__);
+ return ERR_PTR(-EINVAL);
+ }
+
+ npath = cifs_strndup_from_utf16(tmp, plen, true, cache_cp);
+ kfree(tmp);
+
+ if (!npath) {
+ cifs_dbg(VFS, "%s: failed to convert path from utf16\n", __func__);
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ npath = kstrdup(path, GFP_KERNEL);
+ if (!npath)
+ return ERR_PTR(-ENOMEM);
+ }
+ convert_delimiter(npath, '\\');
+ return npath;
}
static inline bool cache_entry_expired(const struct cache_entry *ce)
@@ -171,7 +283,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v)
"cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n",
ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link",
ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags,
- IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no",
+ IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no",
ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no");
list_for_each_entry(t, &ce->tlist, list) {
@@ -240,7 +352,7 @@ static inline void dump_ce(const struct cache_entry *ce)
ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl,
ce->etime.tv_nsec,
ce->hdr_flags, ce->ref_flags,
- IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no",
+ IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no",
ce->path_consumed,
cache_entry_expired(ce) ? "yes" : "no");
dump_tgts(ce);
@@ -284,8 +396,7 @@ int dfs_cache_init(void)
int rc;
int i;
- dfscache_wq = alloc_workqueue("cifs-dfscache",
- WQ_FREEZABLE | WQ_MEM_RECLAIM, 1);
+ dfscache_wq = alloc_workqueue("cifs-dfscache", WQ_FREEZABLE | WQ_UNBOUND, 1);
if (!dfscache_wq)
return -ENOMEM;
@@ -301,7 +412,9 @@ int dfs_cache_init(void)
INIT_HLIST_HEAD(&cache_htable[i]);
atomic_set(&cache_count, 0);
- cache_nlsc = load_nls_default();
+ cache_cp = load_nls("utf8");
+ if (!cache_cp)
+ cache_cp = load_nls_default();
cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__);
return 0;
@@ -311,23 +424,24 @@ out_destroy_wq:
return rc;
}
-static inline unsigned int cache_entry_hash(const void *data, int size)
+static int cache_entry_hash(const void *data, int size, unsigned int *hash)
{
- unsigned int h;
-
- h = jhash(data, size, 0);
- return h & (CACHE_HTABLE_SIZE - 1);
-}
-
-/* Check whether second path component of @path is SYSVOL or NETLOGON */
-static inline bool is_sysvol_or_netlogon(const char *path)
-{
- const char *s;
- char sep = path[0];
-
- s = strchr(path + 1, sep) + 1;
- return !strncasecmp(s, "sysvol", strlen("sysvol")) ||
- !strncasecmp(s, "netlogon", strlen("netlogon"));
+ int i, clen;
+ const unsigned char *s = data;
+ wchar_t c;
+ unsigned int h = 0;
+
+ for (i = 0; i < size; i += clen) {
+ clen = cache_cp->char2uni(&s[i], size - i, &c);
+ if (unlikely(clen < 0)) {
+ cifs_dbg(VFS, "%s: can't convert char\n", __func__);
+ return clen;
+ }
+ c = cifs_toupper(c);
+ h = jhash(&c, sizeof(c), h);
+ }
+ *hash = h % CACHE_HTABLE_SIZE;
+ return 0;
}
/* Return target hint of a DFS cache entry */
@@ -378,7 +492,7 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
{
int i;
- ce->ttl = refs[0].ttl;
+ ce->ttl = max_t(int, refs[0].ttl, CACHE_MIN_TTL);
ce->etime = get_expire_time(ce->ttl);
ce->srvtype = refs[0].server_type;
ce->hdr_flags = refs[0].flags;
@@ -409,9 +523,7 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
}
/* Allocate a new cache entry */
-static struct cache_entry *alloc_cache_entry(const char *path,
- const struct dfs_info3_param *refs,
- int numrefs)
+static struct cache_entry *alloc_cache_entry(struct dfs_info3_param *refs, int numrefs)
{
struct cache_entry *ce;
int rc;
@@ -420,11 +532,9 @@ static struct cache_entry *alloc_cache_entry(const char *path,
if (!ce)
return ERR_PTR(-ENOMEM);
- ce->path = kstrdup(path, GFP_KERNEL);
- if (!ce->path) {
- kmem_cache_free(cache_slab, ce);
- return ERR_PTR(-ENOMEM);
- }
+ ce->path = refs[0].path_name;
+ refs[0].path_name = NULL;
+
INIT_HLIST_NODE(&ce->hlist);
INIT_LIST_HEAD(&ce->tlist);
@@ -437,13 +547,14 @@ static struct cache_entry *alloc_cache_entry(const char *path,
return ce;
}
-/* Must be called with htable_rw_lock held */
-static void remove_oldest_entry(void)
+static void remove_oldest_entry_locked(void)
{
int i;
struct cache_entry *ce;
struct cache_entry *to_del = NULL;
+ WARN_ON(!rwsem_is_locked(&htable_rw_lock));
+
for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
struct hlist_head *l = &cache_htable[i];
@@ -467,12 +578,24 @@ static void remove_oldest_entry(void)
}
/* Add a new DFS cache entry */
-static int add_cache_entry(const char *path, unsigned int hash,
- struct dfs_info3_param *refs, int numrefs)
+static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs)
{
+ int rc;
struct cache_entry *ce;
+ unsigned int hash;
+
+ WARN_ON(!rwsem_is_locked(&htable_rw_lock));
+
+ if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) {
+ cifs_dbg(FYI, "%s: reached max cache size (%d)\n", __func__, CACHE_MAX_ENTRIES);
+ remove_oldest_entry_locked();
+ }
- ce = alloc_cache_entry(path, refs, numrefs);
+ rc = cache_entry_hash(refs[0].path_name, strlen(refs[0].path_name), &hash);
+ if (rc)
+ return rc;
+
+ ce = alloc_cache_entry(refs, numrefs);
if (IS_ERR(ce))
return PTR_ERR(ce);
@@ -486,65 +609,77 @@ static int add_cache_entry(const char *path, unsigned int hash,
}
spin_unlock(&cache_ttl_lock);
- down_write(&htable_rw_lock);
hlist_add_head(&ce->hlist, &cache_htable[hash]);
dump_ce(ce);
- up_write(&htable_rw_lock);
+
+ atomic_inc(&cache_count);
return 0;
}
-static struct cache_entry *__lookup_cache_entry(const char *path)
+/* Check if two DFS paths are equal. @s1 and @s2 are expected to be in @cache_cp's charset */
+static bool dfs_path_equal(const char *s1, int len1, const char *s2, int len2)
{
- struct cache_entry *ce;
- unsigned int h;
- bool found = false;
+ int i, l1, l2;
+ wchar_t c1, c2;
- h = cache_entry_hash(path, strlen(path));
+ if (len1 != len2)
+ return false;
- hlist_for_each_entry(ce, &cache_htable[h], hlist) {
- if (!strcasecmp(path, ce->path)) {
- found = true;
- dump_ce(ce);
- break;
+ for (i = 0; i < len1; i += l1) {
+ l1 = cache_cp->char2uni(&s1[i], len1 - i, &c1);
+ l2 = cache_cp->char2uni(&s2[i], len2 - i, &c2);
+ if (unlikely(l1 < 0 && l2 < 0)) {
+ if (s1[i] != s2[i])
+ return false;
+ l1 = 1;
+ continue;
}
+ if (l1 != l2)
+ return false;
+ if (cifs_toupper(c1) != cifs_toupper(c2))
+ return false;
}
+ return true;
+}
- if (!found)
- ce = ERR_PTR(-ENOENT);
- return ce;
+static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int hash, int len)
+{
+ struct cache_entry *ce;
+
+ hlist_for_each_entry(ce, &cache_htable[hash], hlist) {
+ if (dfs_path_equal(ce->path, strlen(ce->path), path, len)) {
+ dump_ce(ce);
+ return ce;
+ }
+ }
+ return ERR_PTR(-EEXIST);
}
/*
- * Find a DFS cache entry in hash table and optionally check prefix path against
- * @path.
- * Use whole path components in the match.
- * Must be called with htable_rw_lock held.
+ * Find a DFS cache entry in hash table and optionally check prefix path against normalized @path.
+ *
+ * Use whole path components in the match. Must be called with htable_rw_lock held.
*
- * Return ERR_PTR(-ENOENT) if the entry is not found.
+ * Return ERR_PTR(-EEXIST) if the entry is not found.
*/
-static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *hash)
+static struct cache_entry *lookup_cache_entry(const char *path)
{
- struct cache_entry *ce = ERR_PTR(-ENOENT);
- unsigned int h;
+ struct cache_entry *ce;
int cnt = 0;
- char *npath;
- char *s, *e;
- char sep;
-
- npath = kstrdup(path, GFP_KERNEL);
- if (!npath)
- return ERR_PTR(-ENOMEM);
+ const char *s = path, *e;
+ char sep = *s;
+ unsigned int hash;
+ int rc;
- s = npath;
- sep = *npath;
while ((s = strchr(s, sep)) && ++cnt < 3)
s++;
if (cnt < 3) {
- h = cache_entry_hash(path, strlen(path));
- ce = __lookup_cache_entry(path);
- goto out;
+ rc = cache_entry_hash(path, strlen(path), &hash);
+ if (rc)
+ return ERR_PTR(rc);
+ return __lookup_cache_entry(path, hash, strlen(path));
}
/*
* Handle paths that have more than two path components and are a complete prefix of the DFS
@@ -552,64 +687,29 @@ static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *ha
*
* See MS-DFSC 3.2.5.5 "Receiving a Root Referral Request or Link Referral Request".
*/
- h = cache_entry_hash(npath, strlen(npath));
- e = npath + strlen(npath) - 1;
+ e = path + strlen(path) - 1;
while (e > s) {
- char tmp;
+ int len;
/* skip separators */
while (e > s && *e == sep)
e--;
if (e == s)
- goto out;
-
- tmp = *(e+1);
- *(e+1) = 0;
-
- ce = __lookup_cache_entry(npath);
- if (!IS_ERR(ce)) {
- h = cache_entry_hash(npath, strlen(npath));
break;
- }
- *(e+1) = tmp;
+ len = e + 1 - path;
+ rc = cache_entry_hash(path, len, &hash);
+ if (rc)
+ return ERR_PTR(rc);
+ ce = __lookup_cache_entry(path, hash, len);
+ if (!IS_ERR(ce))
+ return ce;
+
/* backward until separator */
while (e > s && *e != sep)
e--;
}
-out:
- if (hash)
- *hash = h;
- kfree(npath);
- return ce;
-}
-
-static void __vol_release(struct vol_info *vi)
-{
- kfree(vi->fullpath);
- kfree(vi->mntdata);
- smb3_cleanup_fs_context_contents(&vi->ctx);
- kfree(vi);
-}
-
-static void vol_release(struct kref *kref)
-{
- struct vol_info *vi = container_of(kref, struct vol_info, refcnt);
-
- spin_lock(&vol_list_lock);
- list_del(&vi->list);
- spin_unlock(&vol_list_lock);
- __vol_release(vi);
-}
-
-static inline void free_vol_list(void)
-{
- struct vol_info *vi, *nvi;
-
- list_for_each_entry_safe(vi, nvi, &vol_list, list) {
- list_del_init(&vi->list);
- __vol_release(vi);
- }
+ return ERR_PTR(-EEXIST);
}
/**
@@ -618,8 +718,8 @@ static inline void free_vol_list(void)
void dfs_cache_destroy(void)
{
cancel_delayed_work_sync(&refresh_task);
- unload_nls(cache_nlsc);
- free_vol_list();
+ unload_nls(cache_cp);
+ free_mount_group_list();
flush_cache_ents();
kmem_cache_destroy(cache_slab);
destroy_workqueue(dfscache_wq);
@@ -627,18 +727,14 @@ void dfs_cache_destroy(void)
cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__);
}
-/* Must be called with htable_rw_lock held */
-static int __update_cache_entry(const char *path,
- const struct dfs_info3_param *refs,
- int numrefs)
+/* Update a cache entry with the new referral in @refs */
+static int update_cache_entry_locked(struct cache_entry *ce, const struct dfs_info3_param *refs,
+ int numrefs)
{
int rc;
- struct cache_entry *ce;
char *s, *th = NULL;
- ce = lookup_cache_entry(path, NULL);
- if (IS_ERR(ce))
- return PTR_ERR(ce);
+ WARN_ON(!rwsem_is_locked(&htable_rw_lock));
if (ce->tgthint) {
s = ce->tgthint->name;
@@ -657,37 +753,30 @@ static int __update_cache_entry(const char *path,
return rc;
}
-static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_codepage, int remap,
- const char *path, struct dfs_info3_param **refs,
- int *numrefs)
+static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const char *path,
+ struct dfs_info3_param **refs, int *numrefs)
{
- cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path);
+ int rc;
+ int i;
- if (!ses || !ses->server || !ses->server->ops->get_dfs_refer)
- return -EOPNOTSUPP;
- if (unlikely(!nls_codepage))
- return -EINVAL;
+ cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path);
*refs = NULL;
*numrefs = 0;
- return ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs,
- nls_codepage, remap);
-}
-
-/* Update an expired cache entry by getting a new DFS referral from server */
-static int update_cache_entry(const char *path,
- const struct dfs_info3_param *refs,
- int numrefs)
-{
-
- int rc;
+ if (!ses || !ses->server || !ses->server->ops->get_dfs_refer)
+ return -EOPNOTSUPP;
+ if (unlikely(!cache_cp))
+ return -EINVAL;
- down_write(&htable_rw_lock);
- rc = __update_cache_entry(path, refs, numrefs);
- up_write(&htable_rw_lock);
+ rc = ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, cache_cp,
+ NO_MAP_UNI_RSVD);
+ if (!rc) {
+ struct dfs_info3_param *ref = *refs;
+ for (i = 0; i < *numrefs; i++)
+ convert_delimiter(ref[i].path_name, '\\');
+ }
return rc;
}
@@ -697,15 +786,12 @@ static int update_cache_entry(const char *path,
* If the entry wasn't found, it will create a new one. Or if it was found but
* expired, then it will update the entry accordingly.
*
- * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to
+ * For interlinks, cifs_mount() and expand_dfs_referral() are supposed to
* handle them properly.
*/
-static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_codepage, int remap,
- const char *path, bool noreq)
+static int cache_refresh_path(const unsigned int xid, struct cifs_ses *ses, const char *path)
{
int rc;
- unsigned int hash;
struct cache_entry *ce;
struct dfs_info3_param *refs = NULL;
int numrefs = 0;
@@ -713,62 +799,38 @@ static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
cifs_dbg(FYI, "%s: search path: %s\n", __func__, path);
- down_read(&htable_rw_lock);
-
- ce = lookup_cache_entry(path, &hash);
-
- /*
- * If @noreq is set, no requests will be sent to the server. Just return
- * the cache entry.
- */
- if (noreq) {
- up_read(&htable_rw_lock);
- return PTR_ERR_OR_ZERO(ce);
- }
+ down_write(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
if (!IS_ERR(ce)) {
if (!cache_entry_expired(ce)) {
dump_ce(ce);
- up_read(&htable_rw_lock);
+ up_write(&htable_rw_lock);
return 0;
}
} else {
newent = true;
}
- up_read(&htable_rw_lock);
-
/*
- * No entry was found.
- *
- * Request a new DFS referral in order to create a new cache entry, or
- * updating an existing one.
+ * Either the entry was not found, or it is expired.
+ * Request a new DFS referral in order to create or update a cache entry.
*/
- rc = get_dfs_referral(xid, ses, nls_codepage, remap, path,
- &refs, &numrefs);
+ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
if (rc)
- return rc;
+ goto out_unlock;
dump_refs(refs, numrefs);
if (!newent) {
- rc = update_cache_entry(path, refs, numrefs);
- goto out_free_refs;
- }
-
- if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) {
- cifs_dbg(FYI, "%s: reached max cache size (%d)\n",
- __func__, CACHE_MAX_ENTRIES);
- down_write(&htable_rw_lock);
- remove_oldest_entry();
- up_write(&htable_rw_lock);
+ rc = update_cache_entry_locked(ce, refs, numrefs);
+ goto out_unlock;
}
- rc = add_cache_entry(path, hash, refs, numrefs);
- if (!rc)
- atomic_inc(&cache_count);
+ rc = add_cache_entry_locked(refs, numrefs);
-out_free_refs:
+out_unlock:
+ up_write(&htable_rw_lock);
free_dfs_info_array(refs, numrefs);
return rc;
}
@@ -868,7 +930,7 @@ err_free_it:
* needs to be issued:
* @xid: syscall xid
* @ses: smb session to issue the request on
- * @nls_codepage: charset conversion
+ * @cp: codepage
* @remap: path character remapping type
* @path: path to lookup in DFS referral cache.
*
@@ -877,26 +939,25 @@ err_free_it:
*
* Return zero if the target was found, otherwise non-zero.
*/
-int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_codepage, int remap,
- const char *path, struct dfs_info3_param *ref,
+int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *cp,
+ int remap, const char *path, struct dfs_info3_param *ref,
struct dfs_cache_tgt_list *tgt_list)
{
int rc;
const char *npath;
struct cache_entry *ce;
- rc = get_normalized_path(path, &npath);
- if (rc)
- return rc;
+ npath = dfs_cache_canonical_path(path, cp, remap);
+ if (IS_ERR(npath))
+ return PTR_ERR(npath);
- rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ rc = cache_refresh_path(xid, ses, npath);
if (rc)
goto out_free_path;
down_read(&htable_rw_lock);
- ce = lookup_cache_entry(npath, NULL);
+ ce = lookup_cache_entry(npath);
if (IS_ERR(ce)) {
up_read(&htable_rw_lock);
rc = PTR_ERR(ce);
@@ -913,7 +974,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
up_read(&htable_rw_lock);
out_free_path:
- free_normalized_path(path, npath);
+ kfree(npath);
return rc;
}
@@ -925,7 +986,7 @@ out_free_path:
* expired, nor create a new cache entry if @path hasn't been found. It heavily
* relies on an existing cache entry.
*
- * @path: path to lookup in the DFS referral cache.
+ * @path: canonical DFS path to lookup in the DFS referral cache.
* @ref: when non-NULL, store single DFS referral result in it.
* @tgt_list: when non-NULL, store complete DFS target list in it.
*
@@ -937,18 +998,13 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
struct dfs_cache_tgt_list *tgt_list)
{
int rc;
- const char *npath;
struct cache_entry *ce;
- rc = get_normalized_path(path, &npath);
- if (rc)
- return rc;
-
- cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
down_read(&htable_rw_lock);
- ce = lookup_cache_entry(npath, NULL);
+ ce = lookup_cache_entry(path);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
goto out_unlock;
@@ -963,8 +1019,6 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
out_unlock:
up_read(&htable_rw_lock);
- free_normalized_path(path, npath);
-
return rc;
}
@@ -979,16 +1033,15 @@ out_unlock:
*
* @xid: syscall id
* @ses: smb session
- * @nls_codepage: charset conversion
+ * @cp: codepage
* @remap: type of character remapping for paths
- * @path: path to lookup in DFS referral cache.
+ * @path: path to lookup in DFS referral cache
* @it: DFS target iterator
*
* Return zero if the target hint was updated successfully, otherwise non-zero.
*/
int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_codepage, int remap,
- const char *path,
+ const struct nls_table *cp, int remap, const char *path,
const struct dfs_cache_tgt_iterator *it)
{
int rc;
@@ -996,19 +1049,19 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
struct cache_entry *ce;
struct cache_dfs_tgt *t;
- rc = get_normalized_path(path, &npath);
- if (rc)
- return rc;
+ npath = dfs_cache_canonical_path(path, cp, remap);
+ if (IS_ERR(npath))
+ return PTR_ERR(npath);
cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath);
- rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
+ rc = cache_refresh_path(xid, ses, npath);
if (rc)
goto out_free_path;
down_write(&htable_rw_lock);
- ce = lookup_cache_entry(npath, NULL);
+ ce = lookup_cache_entry(npath);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
goto out_unlock;
@@ -1031,8 +1084,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
out_unlock:
up_write(&htable_rw_lock);
out_free_path:
- free_normalized_path(path, npath);
-
+ kfree(npath);
return rc;
}
@@ -1044,32 +1096,26 @@ out_free_path:
* expired, nor create a new cache entry if @path hasn't been found. It heavily
* relies on an existing cache entry.
*
- * @path: path to lookup in DFS referral cache.
+ * @path: canonical DFS path to lookup in DFS referral cache.
* @it: target iterator which contains the target hint to update the cache
* entry with.
*
* Return zero if the target hint was updated successfully, otherwise non-zero.
*/
-int dfs_cache_noreq_update_tgthint(const char *path,
- const struct dfs_cache_tgt_iterator *it)
+int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it)
{
int rc;
- const char *npath;
struct cache_entry *ce;
struct cache_dfs_tgt *t;
if (!it)
return -EINVAL;
- rc = get_normalized_path(path, &npath);
- if (rc)
- return rc;
-
- cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
down_write(&htable_rw_lock);
- ce = lookup_cache_entry(npath, NULL);
+ ce = lookup_cache_entry(path);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
goto out_unlock;
@@ -1092,8 +1138,6 @@ int dfs_cache_noreq_update_tgthint(const char *path,
out_unlock:
up_write(&htable_rw_lock);
- free_normalized_path(path, npath);
-
return rc;
}
@@ -1101,32 +1145,26 @@ out_unlock:
* dfs_cache_get_tgt_referral - returns a DFS referral (@ref) from a given
* target iterator (@it).
*
- * @path: path to lookup in DFS referral cache.
+ * @path: canonical DFS path to lookup in DFS referral cache.
* @it: DFS target iterator.
* @ref: DFS referral pointer to set up the gathered information.
*
* Return zero if the DFS referral was set up correctly, otherwise non-zero.
*/
-int dfs_cache_get_tgt_referral(const char *path,
- const struct dfs_cache_tgt_iterator *it,
+int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it,
struct dfs_info3_param *ref)
{
int rc;
- const char *npath;
struct cache_entry *ce;
if (!it || !ref)
return -EINVAL;
- rc = get_normalized_path(path, &npath);
- if (rc)
- return rc;
-
- cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
down_read(&htable_rw_lock);
- ce = lookup_cache_entry(npath, NULL);
+ ce = lookup_cache_entry(path);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
goto out_unlock;
@@ -1138,132 +1176,55 @@ int dfs_cache_get_tgt_referral(const char *path,
out_unlock:
up_read(&htable_rw_lock);
- free_normalized_path(path, npath);
-
return rc;
}
/**
- * dfs_cache_add_vol - add a cifs context during mount() that will be handled by
- * DFS cache refresh worker.
- *
- * @mntdata: mount data.
- * @ctx: cifs context.
- * @fullpath: origin full path.
+ * dfs_cache_add_refsrv_session - add SMB session of referral server
*
- * Return zero if context was set up correctly, otherwise non-zero.
+ * @mount_id: mount group uuid to lookup.
+ * @ses: reference counted SMB session of referral server.
*/
-int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, const char *fullpath)
+void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses)
{
- int rc;
- struct vol_info *vi;
-
- if (!ctx || !fullpath || !mntdata)
- return -EINVAL;
-
- cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
-
- vi = kzalloc(sizeof(*vi), GFP_KERNEL);
- if (!vi)
- return -ENOMEM;
+ struct mount_group *mg;
- vi->fullpath = kstrdup(fullpath, GFP_KERNEL);
- if (!vi->fullpath) {
- rc = -ENOMEM;
- goto err_free_vi;
- }
-
- rc = smb3_fs_context_dup(&vi->ctx, ctx);
- if (rc)
- goto err_free_fullpath;
-
- vi->mntdata = mntdata;
- spin_lock_init(&vi->ctx_lock);
- kref_init(&vi->refcnt);
-
- spin_lock(&vol_list_lock);
- list_add_tail(&vi->list, &vol_list);
- spin_unlock(&vol_list_lock);
-
- return 0;
-
-err_free_fullpath:
- kfree(vi->fullpath);
-err_free_vi:
- kfree(vi);
- return rc;
-}
+ if (WARN_ON_ONCE(!mount_id || uuid_is_null(mount_id) || !ses))
+ return;
-/* Must be called with vol_list_lock held */
-static struct vol_info *find_vol(const char *fullpath)
-{
- struct vol_info *vi;
+ mg = get_mount_group(mount_id);
+ if (WARN_ON_ONCE(IS_ERR(mg)))
+ return;
- list_for_each_entry(vi, &vol_list, list) {
- cifs_dbg(FYI, "%s: vi->fullpath: %s\n", __func__, vi->fullpath);
- if (!strcasecmp(vi->fullpath, fullpath))
- return vi;
- }
- return ERR_PTR(-ENOENT);
+ spin_lock(&mg->lock);
+ if (mg->num_sessions < ARRAY_SIZE(mg->sessions))
+ mg->sessions[mg->num_sessions++] = ses;
+ spin_unlock(&mg->lock);
+ kref_put(&mg->refcount, mount_group_release);
}
/**
- * dfs_cache_update_vol - update vol info in DFS cache after failover
+ * dfs_cache_put_refsrv_sessions - put all referral server sessions
*
- * @fullpath: fullpath to look up in volume list.
- * @server: TCP ses pointer.
+ * Put all SMB sessions from the given mount group id.
*
- * Return zero if volume was updated, otherwise non-zero.
+ * @mount_id: mount group uuid to lookup.
*/
-int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server)
+void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id)
{
- struct vol_info *vi;
-
- if (!fullpath || !server)
- return -EINVAL;
-
- cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
-
- spin_lock(&vol_list_lock);
- vi = find_vol(fullpath);
- if (IS_ERR(vi)) {
- spin_unlock(&vol_list_lock);
- return PTR_ERR(vi);
- }
- kref_get(&vi->refcnt);
- spin_unlock(&vol_list_lock);
-
- cifs_dbg(FYI, "%s: updating volume info\n", __func__);
- spin_lock(&vi->ctx_lock);
- memcpy(&vi->ctx.dstaddr, &server->dstaddr,
- sizeof(vi->ctx.dstaddr));
- spin_unlock(&vi->ctx_lock);
+ struct mount_group *mg;
- kref_put(&vi->refcnt, vol_release);
-
- return 0;
-}
-
-/**
- * dfs_cache_del_vol - remove volume info in DFS cache during umount()
- *
- * @fullpath: fullpath to look up in volume list.
- */
-void dfs_cache_del_vol(const char *fullpath)
-{
- struct vol_info *vi;
-
- if (!fullpath || !*fullpath)
+ if (!mount_id || uuid_is_null(mount_id))
return;
- cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
-
- spin_lock(&vol_list_lock);
- vi = find_vol(fullpath);
- spin_unlock(&vol_list_lock);
-
- if (!IS_ERR(vi))
- kref_put(&vi->refcnt, vol_release);
+ mutex_lock(&mount_group_list_lock);
+ mg = find_mount_group_locked(mount_id);
+ if (IS_ERR(mg)) {
+ mutex_unlock(&mount_group_list_lock);
+ return;
+ }
+ mutex_unlock(&mount_group_list_lock);
+ kref_put(&mg->refcount, mount_group_release);
}
/**
@@ -1276,8 +1237,8 @@ void dfs_cache_del_vol(const char *fullpath)
*
* Return zero if target was parsed correctly, otherwise non-zero.
*/
-int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
- char **share, char **prefix)
+int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share,
+ char **prefix)
{
char *s, sep, *p;
size_t len;
@@ -1332,278 +1293,190 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
return 0;
}
-/* Get all tcons that are within a DFS namespace and can be refreshed */
-static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
+/*
+ * Refresh all active dfs mounts regardless of whether they are in cache or not.
+ * (cache can be cleared)
+ */
+static void refresh_mounts(struct cifs_ses **sessions)
{
+ struct TCP_Server_Info *server;
struct cifs_ses *ses;
- struct cifs_tcon *tcon;
+ struct cifs_tcon *tcon, *ntcon;
+ struct list_head tcons;
+ unsigned int xid;
- INIT_LIST_HEAD(head);
+ INIT_LIST_HEAD(&tcons);
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
- if (!tcon->need_reconnect && !tcon->need_reopen_files &&
- tcon->dfs_path) {
- tcon->tc_count++;
- list_add_tail(&tcon->ulist, head);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (tcon->dfs_path) {
+ tcon->tc_count++;
+ list_add_tail(&tcon->ulist, &tcons);
+ }
}
}
- if (ses->tcon_ipc && !ses->tcon_ipc->need_reconnect &&
- ses->tcon_ipc->dfs_path) {
- list_add_tail(&ses->tcon_ipc->ulist, head);
- }
}
spin_unlock(&cifs_tcp_ses_lock);
-}
-static bool is_dfs_link(const char *path)
-{
- char *s;
-
- s = strchr(path + 1, '\\');
- if (!s)
- return false;
- return !!strchr(s + 1, '\\');
-}
-
-static char *get_dfs_root(const char *path)
-{
- char *s, *npath;
-
- s = strchr(path + 1, '\\');
- if (!s)
- return ERR_PTR(-EINVAL);
-
- s = strchr(s + 1, '\\');
- if (!s)
- return ERR_PTR(-EINVAL);
-
- npath = kstrndup(path, s - path, GFP_KERNEL);
- if (!npath)
- return ERR_PTR(-ENOMEM);
+ list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) {
+ const char *path = tcon->dfs_path + 1;
+ struct cache_entry *ce;
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+ bool needs_refresh = false;
+ int rc = 0;
- return npath;
-}
+ list_del_init(&tcon->ulist);
-static inline void put_tcp_server(struct TCP_Server_Info *server)
-{
- cifs_put_tcp_session(server, 0);
-}
+ ses = find_ipc_from_server_path(sessions, path);
+ if (IS_ERR(ses))
+ goto next_tcon;
-static struct TCP_Server_Info *get_tcp_server(struct smb3_fs_context *ctx)
-{
- struct TCP_Server_Info *server;
+ down_read(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ needs_refresh = IS_ERR(ce) || cache_entry_expired(ce);
+ up_read(&htable_rw_lock);
- server = cifs_find_tcp_session(ctx);
- if (IS_ERR_OR_NULL(server))
- return NULL;
+ if (!needs_refresh)
+ goto next_tcon;
+
+ xid = get_xid();
+ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
+ free_xid(xid);
+
+ /* Create or update a cache entry with the new referral */
+ if (!rc) {
+ down_write(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ if (IS_ERR(ce))
+ add_cache_entry_locked(refs, numrefs);
+ else if (cache_entry_expired(ce))
+ update_cache_entry_locked(ce, refs, numrefs);
+ up_write(&htable_rw_lock);
+ }
- spin_lock(&GlobalMid_Lock);
- if (server->tcpStatus != CifsGood) {
- spin_unlock(&GlobalMid_Lock);
- put_tcp_server(server);
- return NULL;
+next_tcon:
+ free_dfs_info_array(refs, numrefs);
+ cifs_put_tcon(tcon);
}
- spin_unlock(&GlobalMid_Lock);
-
- return server;
}
-/* Find root SMB session out of a DFS link path */
-static struct cifs_ses *find_root_ses(struct vol_info *vi,
- struct cifs_tcon *tcon,
- const char *path)
+static void refresh_cache(struct cifs_ses **sessions)
{
- char *rpath;
- int rc;
- struct cache_entry *ce;
- struct dfs_info3_param ref = {0};
- char *mdata = NULL, *devname = NULL;
- struct TCP_Server_Info *server;
+ int i;
struct cifs_ses *ses;
- struct smb3_fs_context ctx = {NULL};
+ unsigned int xid;
+ char *ref_paths[CACHE_MAX_ENTRIES];
+ int count = 0;
+ struct cache_entry *ce;
- rpath = get_dfs_root(path);
- if (IS_ERR(rpath))
- return ERR_CAST(rpath);
+ /*
+ * Refresh all cached entries. Get all new referrals outside critical section to avoid
+ * starvation while performing SMB2 IOCTL on broken or slow connections.
+ * The cache entries may cover more paths than the active mounts
+ * (e.g. domain-based DFS referrals or multi tier DFS setups).
+ */
down_read(&htable_rw_lock);
+ for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
+ struct hlist_head *l = &cache_htable[i];
- ce = lookup_cache_entry(rpath, NULL);
- if (IS_ERR(ce)) {
- up_read(&htable_rw_lock);
- ses = ERR_CAST(ce);
- goto out;
- }
-
- rc = setup_referral(path, ce, &ref, get_tgt_name(ce));
- if (rc) {
- up_read(&htable_rw_lock);
- ses = ERR_PTR(rc);
- goto out;
+ hlist_for_each_entry(ce, l, hlist) {
+ if (count == ARRAY_SIZE(ref_paths))
+ goto out_unlock;
+ if (hlist_unhashed(&ce->hlist) || !cache_entry_expired(ce) ||
+ IS_ERR(find_ipc_from_server_path(sessions, ce->path)))
+ continue;
+ ref_paths[count++] = kstrdup(ce->path, GFP_ATOMIC);
+ }
}
+out_unlock:
up_read(&htable_rw_lock);
- mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref,
- &devname);
- free_dfs_info_param(&ref);
-
- if (IS_ERR(mdata)) {
- ses = ERR_CAST(mdata);
- mdata = NULL;
- goto out;
- }
-
- rc = cifs_setup_volume_info(&ctx, NULL, devname);
-
- if (rc) {
- ses = ERR_PTR(rc);
- goto out;
- }
-
- server = get_tcp_server(&ctx);
- if (!server) {
- ses = ERR_PTR(-EHOSTDOWN);
- goto out;
- }
-
- ses = cifs_get_smb_ses(server, &ctx);
-
-out:
- smb3_cleanup_fs_context_contents(&ctx);
- kfree(mdata);
- kfree(rpath);
- kfree(devname);
-
- return ses;
-}
-
-/* Refresh DFS cache entry from a given tcon */
-static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon)
-{
- int rc = 0;
- unsigned int xid;
- const char *path, *npath;
- struct cache_entry *ce;
- struct cifs_ses *root_ses = NULL, *ses;
- struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
-
- xid = get_xid();
-
- path = tcon->dfs_path + 1;
+ for (i = 0; i < count; i++) {
+ char *path = ref_paths[i];
+ struct dfs_info3_param *refs = NULL;
+ int numrefs = 0;
+ int rc = 0;
- rc = get_normalized_path(path, &npath);
- if (rc)
- goto out_free_xid;
-
- down_read(&htable_rw_lock);
-
- ce = lookup_cache_entry(npath, NULL);
- if (IS_ERR(ce)) {
- rc = PTR_ERR(ce);
- up_read(&htable_rw_lock);
- goto out_free_path;
- }
+ if (!path)
+ continue;
- if (!cache_entry_expired(ce)) {
- up_read(&htable_rw_lock);
- goto out_free_path;
- }
+ ses = find_ipc_from_server_path(sessions, path);
+ if (IS_ERR(ses))
+ goto next_referral;
- up_read(&htable_rw_lock);
+ xid = get_xid();
+ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
+ free_xid(xid);
- /* If it's a DFS Link, then use root SMB session for refreshing it */
- if (is_dfs_link(npath)) {
- ses = root_ses = find_root_ses(vi, tcon, npath);
- if (IS_ERR(ses)) {
- rc = PTR_ERR(ses);
- root_ses = NULL;
- goto out_free_path;
+ if (!rc) {
+ down_write(&htable_rw_lock);
+ ce = lookup_cache_entry(path);
+ /*
+ * We need to re-check it because other tasks might have deleted or
+ * updated it.
+ */
+ if (!IS_ERR(ce) && cache_entry_expired(ce))
+ update_cache_entry_locked(ce, refs, numrefs);
+ up_write(&htable_rw_lock);
}
- } else {
- ses = tcon->ses;
- }
- rc = get_dfs_referral(xid, ses, cache_nlsc, tcon->remap, npath, &refs,
- &numrefs);
- if (!rc) {
- dump_refs(refs, numrefs);
- rc = update_cache_entry(npath, refs, numrefs);
+next_referral:
+ kfree(path);
free_dfs_info_array(refs, numrefs);
}
-
- if (root_ses)
- cifs_put_smb_ses(root_ses);
-
-out_free_path:
- free_normalized_path(path, npath);
-
-out_free_xid:
- free_xid(xid);
- return rc;
}
/*
- * Worker that will refresh DFS cache based on lowest TTL value from a DFS
+ * Worker that will refresh DFS cache and active mounts based on lowest TTL value from a DFS
* referral.
*/
static void refresh_cache_worker(struct work_struct *work)
{
- struct vol_info *vi, *nvi;
- struct TCP_Server_Info *server;
- LIST_HEAD(vols);
- LIST_HEAD(tcons);
- struct cifs_tcon *tcon, *ntcon;
- int rc;
-
- /*
- * Find SMB volumes that are eligible (server->tcpStatus == CifsGood)
- * for refreshing.
- */
- spin_lock(&vol_list_lock);
- list_for_each_entry(vi, &vol_list, list) {
- server = get_tcp_server(&vi->ctx);
- if (!server)
- continue;
-
- kref_get(&vi->refcnt);
- list_add_tail(&vi->rlist, &vols);
- put_tcp_server(server);
+ struct list_head mglist;
+ struct mount_group *mg, *tmp_mg;
+ struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
+ int max_sessions = ARRAY_SIZE(sessions) - 1;
+ int i = 0, count;
+
+ INIT_LIST_HEAD(&mglist);
+
+ /* Get references of mount groups */
+ mutex_lock(&mount_group_list_lock);
+ list_for_each_entry(mg, &mount_group_list, list) {
+ kref_get(&mg->refcount);
+ list_add(&mg->refresh_list, &mglist);
}
- spin_unlock(&vol_list_lock);
-
- /* Walk through all TCONs and refresh any expired cache entry */
- list_for_each_entry_safe(vi, nvi, &vols, rlist) {
- spin_lock(&vi->ctx_lock);
- server = get_tcp_server(&vi->ctx);
- spin_unlock(&vi->ctx_lock);
+ mutex_unlock(&mount_group_list_lock);
- if (!server)
- goto next_vol;
-
- get_tcons(server, &tcons);
- rc = 0;
-
- list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) {
- /*
- * Skip tcp server if any of its tcons failed to refresh
- * (possibily due to reconnects).
- */
- if (!rc)
- rc = refresh_tcon(vi, tcon);
+ /* Fill in local array with a NULL-terminated list of all referral server sessions */
+ list_for_each_entry(mg, &mglist, refresh_list) {
+ if (i >= max_sessions)
+ break;
- list_del_init(&tcon->ulist);
- cifs_put_tcon(tcon);
- }
+ spin_lock(&mg->lock);
+ if (i + mg->num_sessions > max_sessions)
+ count = max_sessions - i;
+ else
+ count = mg->num_sessions;
+ memcpy(&sessions[i], mg->sessions, count * sizeof(mg->sessions[0]));
+ spin_unlock(&mg->lock);
+ i += count;
+ }
- put_tcp_server(server);
+ if (sessions[0]) {
+ /* Refresh all active mounts and cached entries */
+ refresh_mounts(sessions);
+ refresh_cache(sessions);
+ }
-next_vol:
- list_del_init(&vi->rlist);
- kref_put(&vi->refcnt, vol_release);
+ list_for_each_entry_safe(mg, tmp_mg, &mglist, refresh_list) {
+ list_del_init(&mg->refresh_list);
+ kref_put(&mg->refcount, mount_group_release);
}
spin_lock(&cache_ttl_lock);
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index 1afc4f590c47..b29d3ae64829 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -10,6 +10,7 @@
#include <linux/nls.h>
#include <linux/list.h>
+#include <linux/uuid.h>
#include "cifsglob.h"
struct dfs_cache_tgt_list {
@@ -23,34 +24,26 @@ struct dfs_cache_tgt_iterator {
struct list_head it_list;
};
-extern int dfs_cache_init(void);
-extern void dfs_cache_destroy(void);
+int dfs_cache_init(void);
+void dfs_cache_destroy(void);
extern const struct proc_ops dfscache_proc_ops;
-extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *nls_codepage, int remap,
- const char *path, struct dfs_info3_param *ref,
- struct dfs_cache_tgt_list *tgt_list);
-extern int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
- struct dfs_cache_tgt_list *tgt_list);
-extern int dfs_cache_update_tgthint(const unsigned int xid,
- struct cifs_ses *ses,
- const struct nls_table *nls_codepage,
- int remap, const char *path,
- const struct dfs_cache_tgt_iterator *it);
-extern int
-dfs_cache_noreq_update_tgthint(const char *path,
- const struct dfs_cache_tgt_iterator *it);
-extern int dfs_cache_get_tgt_referral(const char *path,
- const struct dfs_cache_tgt_iterator *it,
- struct dfs_info3_param *ref);
-extern int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx,
- const char *fullpath);
-extern int dfs_cache_update_vol(const char *fullpath,
- struct TCP_Server_Info *server);
-extern void dfs_cache_del_vol(const char *fullpath);
-extern int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it,
- char **share, char **prefix);
+int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *cp,
+ int remap, const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list);
+int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
+ struct dfs_cache_tgt_list *tgt_list);
+int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
+ const struct nls_table *cp, int remap, const char *path,
+ const struct dfs_cache_tgt_iterator *it);
+int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it);
+int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it,
+ struct dfs_info3_param *ref);
+int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share,
+ char **prefix);
+void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id);
+void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses);
+char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap);
static inline struct dfs_cache_tgt_iterator *
dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6bcd3e8f7cda..79402ca0ddfa 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/dir.c
*
@@ -6,19 +7,6 @@
* Copyright (C) International Business Machines Corp., 2002,2009
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/stat.h>
@@ -396,10 +384,11 @@ cifs_create_set_dentry:
goto out_err;
}
- if (S_ISDIR(newinode->i_mode)) {
- rc = -EISDIR;
- goto out_err;
- }
+ if (newinode)
+ if (S_ISDIR(newinode->i_mode)) {
+ rc = -EISDIR;
+ goto out_err;
+ }
d_drop(direntry);
d_add(direntry, newinode);
@@ -630,6 +619,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
struct inode *newInode = NULL;
const char *full_path;
void *page;
+ int retry_count = 0;
xid = get_xid();
@@ -673,6 +663,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
full_path, d_inode(direntry));
+again:
if (pTcon->posix_extensions)
rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid);
else if (pTcon->unix_ext) {
@@ -687,6 +678,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
/* since paths are not looked up by component - the parent
directories are presumed to be good here */
renew_parental_timestamps(direntry);
+ } else if (rc == -EAGAIN && retry_count++ < 10) {
+ goto again;
} else if (rc == -ENOENT) {
cifs_set_time(direntry, jiffies);
newInode = NULL;
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 534cbba72789..d15b82d569ef 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/dns_resolve.c
*
@@ -10,19 +11,6 @@
* Contains the CIFS DFS upcall routines used for hostname to
* IP address translation.
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/slab.h>
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index d3f5d27f4d06..5be060b82b13 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS
* Handles host name to IP address resolution
@@ -5,19 +6,6 @@
* Copyright (c) International Business Machines Corp., 2008
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _DNS_RESOLVE_H
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index eb0bb8ca8e63..747a540db954 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/export.c
*
@@ -8,19 +9,6 @@
*
* Operations related to support for exporting files via NFSD
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6caad100c3f3..cd108607a070 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/file.c
*
@@ -7,19 +8,6 @@
* Author(s): Steve French (sfrench@us.ibm.com)
* Jeremy Allison (jra@samba.org)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/backing-dev.h>
@@ -323,8 +311,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
cfile->dentry = dget(dentry);
cfile->f_flags = file->f_flags;
cfile->invalidHandle = false;
- cfile->oplock_break_received = false;
- cfile->deferred_scheduled = false;
+ cfile->deferred_close_scheduled = false;
cfile->tlink = cifs_get_tlink(tlink);
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
INIT_WORK(&cfile->put, cifsFileInfo_put_work);
@@ -574,21 +561,18 @@ int cifs_open(struct inode *inode, struct file *file)
file->f_op = &cifs_file_direct_ops;
}
- spin_lock(&CIFS_I(inode)->deferred_lock);
/* Get the cached handle as SMB2 close is deferred */
rc = cifs_get_readable_path(tcon, full_path, &cfile);
if (rc == 0) {
if (file->f_flags == cfile->f_flags) {
file->private_data = cfile;
+ spin_lock(&CIFS_I(inode)->deferred_lock);
cifs_del_deferred_close(cfile);
spin_unlock(&CIFS_I(inode)->deferred_lock);
goto out;
} else {
- spin_unlock(&CIFS_I(inode)->deferred_lock);
_cifsFileInfo_put(cfile, true, false);
}
- } else {
- spin_unlock(&CIFS_I(inode)->deferred_lock);
}
if (server->oplocks)
@@ -878,12 +862,8 @@ void smb2_deferred_work_close(struct work_struct *work)
struct cifsFileInfo, deferred.work);
spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
- if (!cfile->deferred_scheduled) {
- spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
- return;
- }
cifs_del_deferred_close(cfile);
- cfile->deferred_scheduled = false;
+ cfile->deferred_close_scheduled = false;
spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock);
_cifsFileInfo_put(cfile, true, false);
}
@@ -900,19 +880,26 @@ int cifs_close(struct inode *inode, struct file *file)
file->private_data = NULL;
dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
if ((cinode->oplock == CIFS_CACHE_RHW_FLG) &&
+ cinode->lease_granted &&
dclose) {
if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags))
inode->i_ctime = inode->i_mtime = current_time(inode);
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
- if (cfile->deferred_scheduled) {
- mod_delayed_work(deferredclose_wq,
- &cfile->deferred, cifs_sb->ctx->acregmax);
+ if (cfile->deferred_close_scheduled &&
+ delayed_work_pending(&cfile->deferred)) {
+ /*
+ * If there is no pending work, mod_delayed_work queues new work.
+ * So, increase the ref count to avoid use-after-free.
+ */
+ if (!mod_delayed_work(deferredclose_wq,
+ &cfile->deferred, cifs_sb->ctx->acregmax))
+ cifsFileInfo_get(cfile);
} else {
/* Deferred close for files */
queue_delayed_work(deferredclose_wq,
&cfile->deferred, cifs_sb->ctx->acregmax);
- cfile->deferred_scheduled = true;
+ cfile->deferred_close_scheduled = true;
spin_unlock(&cinode->deferred_lock);
return 0;
}
@@ -2020,8 +2007,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) {
- if ((!open_file->invalidHandle) &&
- (!open_file->oplock_break_received)) {
+ if ((!open_file->invalidHandle)) {
/* found a good file */
/* lock it so it will not be closed on us */
cifsFileInfo_get(open_file);
@@ -4874,14 +4860,20 @@ oplock_break_ack:
}
/*
* When oplock break is received and there are no active
- * file handles but cached, then set the flag oplock_break_received.
+ * file handles but cached, then schedule deferred close immediately.
* So, new open will not use cached handle.
*/
spin_lock(&CIFS_I(inode)->deferred_lock);
is_deferred = cifs_is_deferred_close(cfile, &dclose);
- if (is_deferred && cfile->deferred_scheduled) {
- cfile->oplock_break_received = true;
- mod_delayed_work(deferredclose_wq, &cfile->deferred, 0);
+ if (is_deferred &&
+ cfile->deferred_close_scheduled &&
+ delayed_work_pending(&cfile->deferred)) {
+ /*
+ * If there is no pending work, mod_delayed_work queues new work.
+ * So, increase the ref count to avoid use-after-free.
+ */
+ if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0))
+ cifsFileInfo_get(cfile);
}
spin_unlock(&CIFS_I(inode)->deferred_lock);
_cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 5d21cd905315..92d4ab029c91 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -1145,7 +1145,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
/* if iocharset not set then load_nls_default
* is used by caller
*/
- cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset);
+ cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset);
break;
case Opt_netbiosname:
memset(ctx->source_rfc1001_name, 0x20,
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index 20d24af33ee2..dd625033cd6b 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/fscache.c - CIFS filesystem cache interface
*
* Copyright (c) 2010 Novell, Inc.
* Author(s): Suresh Jayaraman <sjayaraman@suse.de>
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "fscache.h"
#include "cifsglob.h"
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index e811f2dd7619..3d55cb2ef055 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/fscache.h - CIFS filesystem cache interface definitions
*
* Copyright (c) 2010 Novell, Inc.
* Authors(s): Suresh Jayaraman (sjayaraman@suse.de>
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _CIFS_FSCACHE_H
#define _CIFS_FSCACHE_H
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 1dfa57982522..b96b253e7635 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/inode.c
*
* Copyright (C) International Business Machines Corp., 2002,2010
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/stat.h>
@@ -367,9 +355,12 @@ cifs_get_file_info_unix(struct file *filp)
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, inode->i_sb);
rc = 0;
- }
+ } else
+ goto cifs_gfiunix_out;
rc = cifs_fattr_to_inode(inode, &fattr);
+
+cifs_gfiunix_out:
free_xid(xid);
return rc;
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 28ec8d7c521a..42c6a0bac6c8 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/ioctl.c
*
@@ -6,19 +7,6 @@
* Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
@@ -33,6 +21,7 @@
#include "cifsfs.h"
#include "cifs_ioctl.h"
#include "smb2proto.h"
+#include "smb2glob.h"
#include <linux/btrfs.h>
static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
@@ -214,48 +203,112 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg)
return 0;
}
-static int cifs_dump_full_key(struct cifs_tcon *tcon, unsigned long arg)
+static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in)
{
- struct smb3_full_key_debug_info pfull_key_inf;
- __u64 suid;
- struct list_head *tmp;
+ struct smb3_full_key_debug_info out;
struct cifs_ses *ses;
+ int rc = 0;
bool found = false;
+ u8 __user *end;
- if (!smb3_encryption_required(tcon))
- return -EOPNOTSUPP;
+ if (!smb3_encryption_required(tcon)) {
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* copy user input into our output buffer */
+ if (copy_from_user(&out, in, sizeof(out))) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (!out.session_id) {
+ /* if ses id is 0, use current user session */
+ ses = tcon->ses;
+ } else {
+ /* otherwise if a session id is given, look for it in all our sessions */
+ struct cifs_ses *ses_it = NULL;
+ struct TCP_Server_Info *server_it = NULL;
- ses = tcon->ses; /* default to user id for current user */
- if (get_user(suid, (__u64 __user *)arg))
- suid = 0;
- if (suid) {
- /* search to see if there is a session with a matching SMB UID */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &tcon->ses->server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
- if (ses->Suid == suid) {
- found = true;
- break;
+ list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) {
+ if (ses_it->Suid == out.session_id) {
+ ses = ses_it;
+ /*
+ * since we are using the session outside the crit
+ * section, we need to make sure it won't be released
+ * so increment its refcount
+ */
+ ses->ses_count++;
+ found = true;
+ goto search_end;
+ }
}
}
+search_end:
spin_unlock(&cifs_tcp_ses_lock);
- if (found == false)
- return -EINVAL;
- } /* else uses default user's SMB UID (ie current user) */
-
- pfull_key_inf.cipher_type = le16_to_cpu(ses->server->cipher_type);
- pfull_key_inf.Suid = ses->Suid;
- memcpy(pfull_key_inf.auth_key, ses->auth_key.response,
- 16 /* SMB2_NTLMV2_SESSKEY_SIZE */);
- memcpy(pfull_key_inf.smb3decryptionkey, ses->smb3decryptionkey,
- 32 /* SMB3_ENC_DEC_KEY_SIZE */);
- memcpy(pfull_key_inf.smb3encryptionkey,
- ses->smb3encryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */);
- if (copy_to_user((void __user *)arg, &pfull_key_inf,
- sizeof(struct smb3_full_key_debug_info)))
- return -EFAULT;
+ if (!found) {
+ rc = -ENOENT;
+ goto out;
+ }
+ }
- return 0;
+ switch (ses->server->cipher_type) {
+ case SMB2_ENCRYPTION_AES128_CCM:
+ case SMB2_ENCRYPTION_AES128_GCM:
+ out.session_key_length = CIFS_SESS_KEY_SIZE;
+ out.server_in_key_length = out.server_out_key_length = SMB3_GCM128_CRYPTKEY_SIZE;
+ break;
+ case SMB2_ENCRYPTION_AES256_CCM:
+ case SMB2_ENCRYPTION_AES256_GCM:
+ out.session_key_length = CIFS_SESS_KEY_SIZE;
+ out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE;
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* check if user buffer is big enough to store all the keys */
+ if (out.in_size < sizeof(out) + out.session_key_length + out.server_in_key_length
+ + out.server_out_key_length) {
+ rc = -ENOBUFS;
+ goto out;
+ }
+
+ out.session_id = ses->Suid;
+ out.cipher_type = le16_to_cpu(ses->server->cipher_type);
+
+ /* overwrite user input with our output */
+ if (copy_to_user(in, &out, sizeof(out))) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* append all the keys at the end of the user buffer */
+ end = in->data;
+ if (copy_to_user(end, ses->auth_key.response, out.session_key_length)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ end += out.session_key_length;
+
+ if (copy_to_user(end, ses->smb3encryptionkey, out.server_in_key_length)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ end += out.server_in_key_length;
+
+ if (copy_to_user(end, ses->smb3decryptionkey, out.server_out_key_length)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+out:
+ if (found)
+ cifs_put_smb_ses(ses);
+ return rc;
}
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
@@ -371,6 +424,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EOPNOTSUPP;
break;
case CIFS_DUMP_KEY:
+ /*
+ * Dump encryption keys. This is an old ioctl that only
+ * handles AES-128-{CCM,GCM}.
+ */
if (pSMBFile == NULL)
break;
if (!capable(CAP_SYS_ADMIN)) {
@@ -398,11 +455,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
else
rc = 0;
break;
- /*
- * Dump full key (32 bytes instead of 16 bytes) is
- * needed if GCM256 (stronger encryption) negotiated
- */
case CIFS_DUMP_FULL_KEY:
+ /*
+ * Dump encryption keys (handles any key sizes)
+ */
if (pSMBFile == NULL)
break;
if (!capable(CAP_SYS_ADMIN)) {
@@ -410,8 +466,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
break;
}
tcon = tlink_tcon(pSMBFile->tlink);
- rc = cifs_dump_full_key(tcon, arg);
-
+ rc = cifs_dump_full_key(tcon, (void __user *)arg);
break;
case CIFS_IOC_NOTIFY:
if (!S_ISDIR(inode->i_mode)) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 970fcf2adb08..f0a6d63bc08c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/link.c
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/stat.h>
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 524dbdfb7184..184138b4eb8c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/misc.c
*
* Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/slab.h>
@@ -672,6 +660,11 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink,
spin_unlock(&tlink_tcon(open->tlink)->open_file_lock);
}
+/*
+ * Critical section which runs after acquiring deferred_lock.
+ * As there is no reference count on cifs_deferred_close, pdclose
+ * should not be used outside deferred_lock.
+ */
bool
cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose)
{
@@ -688,6 +681,9 @@ cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **
return false;
}
+/*
+ * Critical section which runs after acquiring deferred_lock.
+ */
void
cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose)
{
@@ -707,6 +703,9 @@ cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *
list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes);
}
+/*
+ * Critical section which runs after acquiring deferred_lock.
+ */
void
cifs_del_deferred_close(struct cifsFileInfo *cfile)
{
@@ -738,15 +737,19 @@ void
cifs_close_all_deferred_files(struct cifs_tcon *tcon)
{
struct cifsFileInfo *cfile;
- struct cifsInodeInfo *cinode;
struct list_head *tmp;
spin_lock(&tcon->open_file_lock);
list_for_each(tmp, &tcon->openFileList) {
cfile = list_entry(tmp, struct cifsFileInfo, tlist);
- cinode = CIFS_I(d_inode(cfile->dentry));
- if (delayed_work_pending(&cfile->deferred))
- mod_delayed_work(deferredclose_wq, &cfile->deferred, 0);
+ if (delayed_work_pending(&cfile->deferred)) {
+ /*
+ * If there is no pending work, mod_delayed_work queues new work.
+ * So, increase the ref count to avoid use-after-free.
+ */
+ if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0))
+ cifsFileInfo_get(cfile);
+ }
}
spin_unlock(&tcon->open_file_lock);
}
diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c
index 5aaabe4cc0a7..291cb606f149 100644
--- a/fs/cifs/netlink.c
+++ b/fs/cifs/netlink.c
@@ -30,7 +30,7 @@ static const struct nla_policy cifs_genl_policy[CIFS_GENL_ATTR_MAX + 1] = {
[CIFS_GENL_ATTR_SWN_RESOURCE_NAME] = { .type = NLA_STRING},
};
-static struct genl_ops cifs_genl_ops[] = {
+static const struct genl_ops cifs_genl_ops[] = {
{
.cmd = CIFS_GENL_CMD_SWN_NOTIFY,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 3079b38f0afb..378133ce8869 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/ntlmssp.h
*
* Copyright (c) International Business Machines Corp., 2002,2007
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define NTLMSSP_SIGNATURE "NTLMSSP"
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 63bfc533c9fb..bfee176b901d 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/readdir.c
*
@@ -7,19 +8,6 @@
* Copyright (C) Red Hat, Inc., 2011
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -321,7 +309,7 @@ static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr,
{
__dir_info_to_fattr(fattr, info);
- /* See MS-FSCC 2.4.18 FileIdFullDirectoryInformation */
+ /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */
if (fattr->cf_cifsattrs & ATTR_REPARSE)
fattr->cf_cifstag = le32_to_cpu(info->EaSize);
cifs_fill_common_info(fattr, cifs_sb);
diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h
index 8b69fcceb597..137f7c95afd6 100644
--- a/fs/cifs/rfc1002pdu.h
+++ b/fs/cifs/rfc1002pdu.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/rfc1002pdu.h
*
@@ -6,19 +7,6 @@
* Copyright (c) International Business Machines Corp., 2004
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index a92a1fb7cb52..c5785fd3f52e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/sess.c
*
@@ -6,19 +7,6 @@
* Copyright (c) International Business Machines Corp., 2006, 2009
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "cifspdu.h"
@@ -195,7 +183,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
ses, iface->speed, iface->rdma_capable ? "yes" : "no",
&ipv4->sin_addr);
else
- cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI4)\n",
+ cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI6)\n",
ses, iface->speed, iface->rdma_capable ? "yes" : "no",
&ipv6->sin6_addr);
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2fa3ba354cc9..c9d8a50062b8 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/smb2file.c
*
@@ -5,19 +6,6 @@
* Author(s): Steve French (sfrench@us.ibm.com),
* Pavel Shilovsky ((pshilovsky@samba.org) 2012
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/stat.h>
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index d9a990c99121..d0e9f3782bd9 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/smb2glob.h
*
@@ -9,16 +10,6 @@
* Jeremy Allison (jra@samba.org)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
*/
#ifndef _SMB2_GLOB_H
#define _SMB2_GLOB_H
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 9a61209a283e..957b2594f02e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/smb2inode.c
*
@@ -6,19 +7,6 @@
* Author(s): Pavel Shilovsky (pshilovsky@samba.org),
* Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
#include <linux/stat.h>
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index c775682ee973..cea39bcecbab 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/smb2/smb2maperror.c
*
@@ -6,19 +7,6 @@
* Copyright (C) International Business Machines Corp., 2009
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/errno.h>
#include "cifsglob.h"
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 06d555d4da9a..668f77108831 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/smb2misc.c
*
@@ -6,19 +7,6 @@
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/ctype.h>
#include "smb2pdu.h"
@@ -164,19 +152,16 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
struct smb2_transform_hdr *thdr =
(struct smb2_transform_hdr *)buf;
struct cifs_ses *ses = NULL;
- struct list_head *tmp;
/* decrypt frame now that it is completely read in */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &srvr->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) {
if (ses->Suid == thdr->SessionId)
break;
-
- ses = NULL;
}
spin_unlock(&cifs_tcp_ses_lock);
- if (ses == NULL) {
+ if (list_entry_is_head(ses, &srvr->smb_ses_list,
+ smb_ses_list)) {
cifs_dbg(VFS, "no decryption - session id not found\n");
return 1;
}
@@ -548,7 +533,6 @@ static bool
smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp)
{
__u8 lease_state;
- struct list_head *tmp;
struct cifsFileInfo *cfile;
struct cifsInodeInfo *cinode;
int ack_req = le32_to_cpu(rsp->Flags &
@@ -556,8 +540,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp)
lease_state = le32_to_cpu(rsp->NewLeaseState);
- list_for_each(tmp, &tcon->openFileList) {
- cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
cinode = CIFS_I(d_inode(cfile->dentry));
if (memcmp(cinode->lease_key, rsp->LeaseKey,
@@ -618,7 +601,6 @@ static bool
smb2_is_valid_lease_break(char *buffer)
{
struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
- struct list_head *tmp, *tmp1, *tmp2;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
@@ -628,15 +610,9 @@ smb2_is_valid_lease_break(char *buffer)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &cifs_tcp_ses_list) {
- server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list);
-
- list_for_each(tmp1, &server->smb_ses_list) {
- ses = list_entry(tmp1, struct cifs_ses, smb_ses_list);
-
- list_for_each(tmp2, &ses->tcon_list) {
- tcon = list_entry(tmp2, struct cifs_tcon,
- tcon_list);
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
cifs_stats_inc(
&tcon->stats.cifs_stats.num_oplock_brks);
@@ -687,7 +663,6 @@ bool
smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer;
- struct list_head *tmp, *tmp1, *tmp2;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct cifsInodeInfo *cinode;
@@ -710,16 +685,11 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
- list_for_each(tmp, &server->smb_ses_list) {
- ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
-
- list_for_each(tmp1, &ses->tcon_list) {
- tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->open_file_lock);
- list_for_each(tmp2, &tcon->openFileList) {
- cfile = list_entry(tmp2, struct cifsFileInfo,
- tlist);
+ list_for_each_entry(cfile, &tcon->openFileList, tlist) {
if (rsp->PersistentFid !=
cfile->fid.persistent_fid ||
rsp->VolatileFid !=
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index dd0eb665b680..e4c8f603dd58 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -388,7 +388,9 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
{
int rc;
+ spin_lock(&GlobalMid_Lock);
cifs_ses_server(ses)->CurrentMid = 0;
+ spin_unlock(&GlobalMid_Lock);
rc = SMB2_negotiate(xid, ses);
/* BB we probably don't need to retry with modern servers */
if (rc == -EAGAIN)
@@ -1861,6 +1863,8 @@ smb2_copychunk_range(const unsigned int xid,
cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
/* Request server copy to target from src identified by key */
+ kfree(retbuf);
+ retbuf = NULL;
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
true /* is_fsctl */, (char *)pcchunk,
@@ -2323,6 +2327,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct smb2_query_directory_rsp *qd_rsp = NULL;
struct smb2_create_rsp *op_rsp = NULL;
struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
+ int retry_count = 0;
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
@@ -2370,10 +2375,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
+again:
rc = compound_send_recv(xid, tcon->ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
+ if (rc == -EAGAIN && retry_count++ < 10)
+ goto again;
+
/* If the open failed there is nothing to do */
op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) {
@@ -3599,6 +3608,119 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
return rc;
}
+static int smb3_simple_fallocate_write_range(unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile,
+ loff_t off, loff_t len,
+ char *buf)
+{
+ struct cifs_io_parms io_parms = {0};
+ int nbytes;
+ struct kvec iov[2];
+
+ io_parms.netfid = cfile->fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.persistent_fid = cfile->fid.persistent_fid;
+ io_parms.volatile_fid = cfile->fid.volatile_fid;
+ io_parms.offset = off;
+ io_parms.length = len;
+
+ /* iov[0] is reserved for smb header */
+ iov[1].iov_base = buf;
+ iov[1].iov_len = io_parms.length;
+ return SMB2_write(xid, &io_parms, &nbytes, iov, 1);
+}
+
+static int smb3_simple_fallocate_range(unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile,
+ loff_t off, loff_t len)
+{
+ struct file_allocated_range_buffer in_data, *out_data = NULL, *tmp_data;
+ u32 out_data_len;
+ char *buf = NULL;
+ loff_t l;
+ int rc;
+
+ in_data.file_offset = cpu_to_le64(off);
+ in_data.length = cpu_to_le64(len);
+ rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FSCTL_QUERY_ALLOCATED_RANGES, true,
+ (char *)&in_data, sizeof(in_data),
+ 1024 * sizeof(struct file_allocated_range_buffer),
+ (char **)&out_data, &out_data_len);
+ if (rc)
+ goto out;
+ /*
+ * It is already all allocated
+ */
+ if (out_data_len == 0)
+ goto out;
+
+ buf = kzalloc(1024 * 1024, GFP_KERNEL);
+ if (buf == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ tmp_data = out_data;
+ while (len) {
+ /*
+ * The rest of the region is unmapped so write it all.
+ */
+ if (out_data_len == 0) {
+ rc = smb3_simple_fallocate_write_range(xid, tcon,
+ cfile, off, len, buf);
+ goto out;
+ }
+
+ if (out_data_len < sizeof(struct file_allocated_range_buffer)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (off < le64_to_cpu(tmp_data->file_offset)) {
+ /*
+ * We are at a hole. Write until the end of the region
+ * or until the next allocated data,
+ * whichever comes next.
+ */
+ l = le64_to_cpu(tmp_data->file_offset) - off;
+ if (len < l)
+ l = len;
+ rc = smb3_simple_fallocate_write_range(xid, tcon,
+ cfile, off, l, buf);
+ if (rc)
+ goto out;
+ off = off + l;
+ len = len - l;
+ if (len == 0)
+ goto out;
+ }
+ /*
+ * We are at a section of allocated data, just skip forward
+ * until the end of the data or the end of the region
+ * we are supposed to fallocate, whichever comes first.
+ */
+ l = le64_to_cpu(tmp_data->length);
+ if (len < l)
+ l = len;
+ off += l;
+ len -= l;
+
+ tmp_data = &tmp_data[1];
+ out_data_len -= sizeof(struct file_allocated_range_buffer);
+ }
+
+ out:
+ kfree(out_data);
+ kfree(buf);
+ return rc;
+}
+
+
static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
loff_t off, loff_t len, bool keep_size)
{
@@ -3660,6 +3782,26 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if ((keep_size == true) || (i_size_read(inode) >= off + len)) {
/*
+ * At this point, we are trying to fallocate an internal
+ * region of a sparse file. Since smb2 does not have a
+ * fallocate command we have two options on how to emulate this.
+ * We can either turn the entire file to become non-sparse
+ * which we only do if the fallocate is for virtually
+ * the whole file, or we can overwrite the region with zeroes
+ * using SMB2_write, which could be prohibitively expensive
+ * if len is large.
+ */
+ /*
+ * We are only trying to fallocate a small region so
+ * just write it with zero.
+ */
+ if (len <= 1024 * 1024) {
+ rc = smb3_simple_fallocate_range(xid, tcon, cfile,
+ off, len);
+ goto out;
+ }
+
+ /*
* Check if falloc starts within first few pages of file
* and ends within a few pages of the end of file to
* ensure that most of file is being forced to be
@@ -3981,6 +4123,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
oplock &= 0xFF;
+ cinode->lease_granted = false;
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
@@ -4007,6 +4150,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int new_oplock = 0;
oplock &= 0xFF;
+ cinode->lease_granted = true;
if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
return;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index a8bf43184773..962826dc3316 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/smb2pdu.c
*
@@ -8,19 +9,6 @@
*
* Contains the routines for constructing the SMB2 PDUs themselves
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* SMB2 PDU handling routines here - except for leftovers (eg session setup) */
@@ -958,6 +946,13 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
/* Internal types */
server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES;
+ /*
+ * SMB3.0 supports only 1 cipher and doesn't have an encryption neg context
+ * Set the cipher type manually.
+ */
+ if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
+ server->cipher_type = SMB2_ENCRYPTION_AES128_CCM;
+
security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
(struct smb2_sync_hdr *)rsp);
/*
@@ -1784,10 +1779,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc);
if (rc != 0) {
- if (tcon) {
- cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE);
- tcon->need_reconnect = true;
- }
+ cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE);
+ tcon->need_reconnect = true;
goto tcon_error_exit;
}
@@ -2899,7 +2892,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
#endif /* CIFS_DEBUG2 */
if (buf) {
- memcpy(buf, &rsp->CreationTime, 32);
+ buf->CreationTime = rsp->CreationTime;
+ buf->LastAccessTime = rsp->LastAccessTime;
+ buf->LastWriteTime = rsp->LastWriteTime;
+ buf->ChangeTime = rsp->ChangeTime;
buf->AllocationSize = rsp->AllocationSize;
buf->EndOfFile = rsp->EndofFile;
buf->Attributes = rsp->FileAttributes;
@@ -3477,6 +3473,8 @@ int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
NULL);
}
+#if 0
+/* currently unused, as now we are doing compounding instead (see smb311_posix_query_path_info) */
int
SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, struct smb311_posix_qinfo *data, u32 *plen)
@@ -3488,7 +3486,9 @@ SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon,
return query_info(xid, tcon, persistent_fid, volatile_fid,
SMB_FIND_FILE_POSIX_INFO, SMB2_O_INFO_FILE, 0,
output_len, sizeof(struct smb311_posix_qinfo), (void **)&data, plen);
+ /* Note caller must free "data" (passed in above). It may be allocated in query_info call */
}
+#endif
int
SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
@@ -3900,10 +3900,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
* Related requests use info from previous read request
* in chain.
*/
- shdr->SessionId = 0xFFFFFFFF;
+ shdr->SessionId = 0xFFFFFFFFFFFFFFFF;
shdr->TreeId = 0xFFFFFFFF;
- req->PersistentFileId = 0xFFFFFFFF;
- req->VolatileFileId = 0xFFFFFFFF;
+ req->PersistentFileId = 0xFFFFFFFFFFFFFFFF;
+ req->VolatileFileId = 0xFFFFFFFFFFFFFFFF;
}
}
if (remaining_bytes > io_parms->length)
@@ -4491,7 +4491,7 @@ int posix_info_parse(const void *beg, const void *end,
{
int total_len = 0;
- int sid_len;
+ int owner_len, group_len;
int name_len;
const void *owner_sid;
const void *group_sid;
@@ -4514,17 +4514,17 @@ int posix_info_parse(const void *beg, const void *end,
/* check owner sid */
owner_sid = beg + total_len;
- sid_len = posix_info_sid_size(owner_sid, end);
- if (sid_len < 0)
+ owner_len = posix_info_sid_size(owner_sid, end);
+ if (owner_len < 0)
return -1;
- total_len += sid_len;
+ total_len += owner_len;
/* check group sid */
group_sid = beg + total_len;
- sid_len = posix_info_sid_size(group_sid, end);
- if (sid_len < 0)
+ group_len = posix_info_sid_size(group_sid, end);
+ if (group_len < 0)
return -1;
- total_len += sid_len;
+ total_len += group_len;
/* check name len */
if (beg + total_len + 4 > end)
@@ -4545,10 +4545,8 @@ int posix_info_parse(const void *beg, const void *end,
out->size = total_len;
out->name_len = name_len;
out->name = name;
- memcpy(&out->owner, owner_sid,
- posix_info_sid_size(owner_sid, end));
- memcpy(&out->group, group_sid,
- posix_info_sid_size(group_sid, end));
+ memcpy(&out->owner, owner_sid, owner_len);
+ memcpy(&out->group, group_sid, group_len);
}
return total_len;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 6442dc1c292b..a5c48b85549a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/smb2pdu.h
*
@@ -6,19 +7,6 @@
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _SMB2PDU_H
@@ -276,7 +264,7 @@ struct share_redirect_error_context_rsp {
__le32 NotificationType;
__le32 ResourceNameOffset;
__le32 ResourceNameLength;
- __le16 Flags;
+ __le16 Reserved;
__le16 TargetType;
__le32 IPAddrCount;
struct move_dst_ipaddr IpAddrMoveList[];
@@ -1460,6 +1448,22 @@ struct smb2_echo_rsp {
#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2
+/*
+ * Valid FileInformation classes.
+ *
+ * Note that these are a subset of the (file) QUERY_INFO levels defined
+ * later in this file (but since QUERY_DIRECTORY uses equivalent numbers
+ * we do not redefine them here)
+ *
+ * FileDirectoryInformation 0x01
+ * FileFullDirectoryInformation 0x02
+ * FileIdFullDirectoryInformation 0x26
+ * FileBothDirectoryInformation 0x03
+ * FileIdBothDirectoryInformation 0x25
+ * FileNamesInformation 0x0C
+ * FileIdExtdDirectoryInformation 0x3C
+ */
+
struct smb2_query_directory_req {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 33 */
@@ -1696,6 +1700,7 @@ struct smb3_fs_vol_info {
#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50
#define FILE_STANDARD_LINK_INFORMATION 54
#define FILE_ID_INFORMATION 59
+#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60
struct smb2_file_internal_info {
__le64 IndexNumber;
@@ -1776,13 +1781,31 @@ struct smb2_file_network_open_info {
__le32 Reserved;
} __packed; /* level 34 Query also similar returned in close rsp and open rsp */
-/* See MS-FSCC 2.4.43 */
+/* See MS-FSCC 2.4.21 */
struct smb2_file_id_information {
__le64 VolumeSerialNumber;
__u64 PersistentFileId; /* opaque endianness */
__u64 VolatileFileId; /* opaque endianness */
} __packed; /* level 59 */
+/* See MS-FSCC 2.4.18 */
+struct smb2_file_id_extd_directory_info {
+ __le32 NextEntryOffset;
+ __u32 FileIndex;
+ __le64 CreationTime;
+ __le64 LastAccessTime;
+ __le64 LastWriteTime;
+ __le64 ChangeTime;
+ __le64 EndOfFile;
+ __le64 AllocationSize;
+ __le32 FileAttributes;
+ __le32 FileNameLength;
+ __le32 EaSize; /* EA size */
+ __le32 ReparsePointTag; /* valid if FILE_ATTR_REPARSE_POINT set in FileAttributes */
+ __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit */
+ char FileName[1];
+} __packed; /* level 60 */
+
extern char smb2_padding[7];
/* equivalent of the contents of SMB3.1.1 POSIX open context response */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index a5f87b02cfaf..263767f644f8 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/smb2proto.h
*
@@ -6,19 +7,6 @@
* Author(s): Steve French (sfrench@us.ibm.com)
* Pavel Shilovsky (pshilovsky@samba.org) 2012
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _SMB2PROTO_H
#define _SMB2PROTO_H
@@ -64,8 +52,6 @@ extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern bool smb2_is_valid_oplock_break(char *buffer,
struct TCP_Server_Info *srv);
-extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
- __u64 ses_id);
extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h
index 7505056e9580..0215ef36e240 100644
--- a/fs/cifs/smb2status.h
+++ b/fs/cifs/smb2status.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/smb2status.h
*
@@ -7,19 +8,6 @@
* Copyright (c) International Business Machines Corp., 2009,2011
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index e6fa76ab70be..6f7952ea4941 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/smb2transport.c
*
@@ -7,19 +8,6 @@
* Jeremy Allison (jra@samba.org) 2006
* Pavel Shilovsky (pshilovsky@samba.org) 2012
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
@@ -154,6 +142,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
if (ses->Suid != ses_id)
continue;
+ ++ses->ses_count;
return ses;
}
@@ -205,7 +194,14 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
return NULL;
}
tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid);
+ if (!tcon) {
+ cifs_put_smb_ses(ses);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return NULL;
+ }
spin_unlock(&cifs_tcp_ses_lock);
+ /* tcon already has a ref to ses, so we don't need ses anymore */
+ cifs_put_smb_ses(ses);
return tcon;
}
@@ -239,7 +235,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
if (rc) {
cifs_server_dbg(VFS,
"%s: sha256 alloc failed\n", __func__);
- return rc;
+ goto out;
}
shash = &sdesc->shash;
} else {
@@ -290,6 +286,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
out:
if (allocate_crypto)
cifs_free_hash(&hash, &sdesc);
+ if (ses)
+ cifs_put_smb_ses(ses);
return rc;
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 10dfe5006792..31ef64eb7fbb 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -572,8 +572,13 @@ static struct rdma_cm_id *smbd_create_id(
log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
goto out;
}
- wait_for_completion_interruptible_timeout(
+ rc = wait_for_completion_interruptible_timeout(
&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ /* e.g. if interrupted returns -ERESTARTSYS */
+ if (rc < 0) {
+ log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
+ goto out;
+ }
rc = info->ri_rc;
if (rc) {
log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
@@ -586,8 +591,13 @@ static struct rdma_cm_id *smbd_create_id(
log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
goto out;
}
- wait_for_completion_interruptible_timeout(
+ rc = wait_for_completion_interruptible_timeout(
&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ /* e.g. if interrupted returns -ERESTARTSYS */
+ if (rc < 0) {
+ log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
+ goto out;
+ }
rc = info->ri_rc;
if (rc) {
log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h
index 7f16cb825fe5..60189efb3236 100644
--- a/fs/cifs/smberr.h
+++ b/fs/cifs/smberr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/smberr.h
*
@@ -7,19 +8,6 @@
* See Error Codes section of the SNIA CIFS Specification
* for more information
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define SUCCESS 0x00 /* The request was successful. */
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a0e84747f567..d0fc42061f49 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,22 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
*
* Copyright (c) International Business Machines Corp., 2002,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* IOCTL information */
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index d6df908dccad..dafcb6ab050d 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -12,6 +12,11 @@
#include <linux/tracepoint.h>
+/*
+ * Please use this 3-part article as a reference for writing new tracepoints:
+ * https://lwn.net/Articles/379903/
+ */
+
/* For logging errors in read or write */
DECLARE_EVENT_CLASS(smb3_rw_err_class,
TP_PROTO(unsigned int xid,
@@ -529,16 +534,16 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class,
TP_ARGS(xid, func_name, rc),
TP_STRUCT__entry(
__field(unsigned int, xid)
- __field(const char *, func_name)
+ __string(func_name, func_name)
__field(int, rc)
),
TP_fast_assign(
__entry->xid = xid;
- __entry->func_name = func_name;
+ __assign_str(func_name, func_name);
__entry->rc = rc;
),
TP_printk("\t%s: xid=%u rc=%d",
- __entry->func_name, __entry->xid, __entry->rc)
+ __get_str(func_name), __entry->xid, __entry->rc)
)
#define DEFINE_SMB3_EXIT_ERR_EVENT(name) \
@@ -583,14 +588,14 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class,
TP_ARGS(xid, func_name),
TP_STRUCT__entry(
__field(unsigned int, xid)
- __field(const char *, func_name)
+ __string(func_name, func_name)
),
TP_fast_assign(
__entry->xid = xid;
- __entry->func_name = func_name;
+ __assign_str(func_name, func_name);
),
TP_printk("\t%s: xid=%u",
- __entry->func_name, __entry->xid)
+ __get_str(func_name), __entry->xid)
)
#define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \
@@ -857,16 +862,16 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class,
TP_STRUCT__entry(
__field(__u64, currmid)
__field(__u64, conn_id)
- __field(char *, hostname)
+ __string(hostname, hostname)
),
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __entry->hostname = hostname;
+ __assign_str(hostname, hostname);
),
TP_printk("conn_id=0x%llx server=%s current_mid=%llu",
__entry->conn_id,
- __entry->hostname,
+ __get_str(hostname),
__entry->currmid)
)
@@ -891,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_STRUCT__entry(
__field(__u64, currmid)
__field(__u64, conn_id)
- __field(char *, hostname)
+ __string(hostname, hostname)
__field(int, credits)
__field(int, credits_to_add)
__field(int, in_flight)
@@ -899,7 +904,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_fast_assign(
__entry->currmid = currmid;
__entry->conn_id = conn_id;
- __entry->hostname = hostname;
+ __assign_str(hostname, hostname);
__entry->credits = credits;
__entry->credits_to_add = credits_to_add;
__entry->in_flight = in_flight;
@@ -907,7 +912,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
TP_printk("conn_id=0x%llx server=%s current_mid=%llu "
"credits=%d credit_change=%d in_flight=%d",
__entry->conn_id,
- __entry->hostname,
+ __get_str(hostname),
__entry->currmid,
__entry->credits,
__entry->credits_to_add,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1725b55f364..f65f9a692ca2 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/transport.c
*
@@ -5,19 +6,6 @@
* Author(s): Steve French (sfrench@us.ibm.com)
* Jeremy Allison (jra@samba.org) 2006.
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index aa3e8ca0457c..9ed481e79ce0 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* fs/cifs/xattr.c
*
* Copyright (c) International Business Machines Corp., 2003, 2007
* Author(s): Steve French (sfrench@us.ibm.com)
*
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index e26060dae70a..2f63bf3a7325 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -14,7 +14,7 @@
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-
+#include <linux/uio.h>
#include <linux/configfs.h>
#include "configfs_internal.h"
@@ -77,28 +77,9 @@ static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer)
return 0;
}
-/**
- * configfs_read_file - read an attribute.
- * @file: file pointer.
- * @buf: buffer to fill.
- * @count: number of bytes to read.
- * @ppos: starting offset in file.
- *
- * Userspace wants to read an attribute file. The attribute descriptor
- * is in the file's ->d_fsdata. The target item is in the directory's
- * ->d_fsdata.
- *
- * We call fill_read_buffer() to allocate and fill the buffer from the
- * item's show() method exactly once (if the read is happening from
- * the beginning of the file). That should fill the entire buffer with
- * all the data the item has to offer for that attribute.
- * We then call flush_read_buffer() to copy the buffer to userspace
- * in the increments specified.
- */
-
-static ssize_t
-configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+static ssize_t configfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
struct configfs_buffer *buffer = file->private_data;
ssize_t retval = 0;
@@ -108,43 +89,24 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
if (retval)
goto out;
}
- pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
- __func__, count, *ppos, buffer->page);
- retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
- buffer->count);
+ pr_debug("%s: count = %zd, pos = %lld, buf = %s\n",
+ __func__, iov_iter_count(to), iocb->ki_pos, buffer->page);
+ retval = copy_to_iter(buffer->page, buffer->count, to);
+ iocb->ki_pos += retval;
+ if (retval == 0)
+ retval = -EFAULT;
out:
mutex_unlock(&buffer->mutex);
return retval;
}
-/**
- * configfs_read_bin_file - read a binary attribute.
- * @file: file pointer.
- * @buf: buffer to fill.
- * @count: number of bytes to read.
- * @ppos: starting offset in file.
- *
- * Userspace wants to read a binary attribute file. The attribute
- * descriptor is in the file's ->d_fsdata. The target item is in the
- * directory's ->d_fsdata.
- *
- * We check whether we need to refill the buffer. If so we will
- * call the attributes' attr->read() twice. The first time we
- * will pass a NULL as a buffer pointer, which the attributes' method
- * will use to return the size of the buffer required. If no error
- * occurs we will allocate the buffer using vmalloc and call
- * attr->read() again passing that buffer as an argument.
- * Then we just copy to user-space using simple_read_from_buffer.
- */
-
-static ssize_t
-configfs_read_bin_file(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t configfs_bin_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
struct configfs_fragment *frag = to_frag(file);
struct configfs_buffer *buffer = file->private_data;
ssize_t retval = 0;
- ssize_t len = min_t(size_t, count, PAGE_SIZE);
+ ssize_t len;
mutex_lock(&buffer->mutex);
@@ -200,42 +162,31 @@ configfs_read_bin_file(struct file *file, char __user *buf,
buffer->needs_read_fill = 0;
}
- retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer,
- buffer->bin_buffer_size);
+ retval = copy_to_iter(buffer->bin_buffer, buffer->bin_buffer_size, to);
+ iocb->ki_pos += retval;
+ if (retval == 0)
+ retval = -EFAULT;
out:
mutex_unlock(&buffer->mutex);
return retval;
}
-
-/**
- * fill_write_buffer - copy buffer from userspace.
- * @buffer: data buffer for file.
- * @buf: data from user.
- * @count: number of bytes in @userbuf.
- *
- * Allocate @buffer->page if it hasn't been already, then
- * copy the user-supplied buffer into it.
- */
-
-static int
-fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
+static int fill_write_buffer(struct configfs_buffer *buffer,
+ struct iov_iter *from)
{
- int error;
+ int copied;
if (!buffer->page)
buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0);
if (!buffer->page)
return -ENOMEM;
- if (count >= SIMPLE_ATTR_SIZE)
- count = SIMPLE_ATTR_SIZE - 1;
- error = copy_from_user(buffer->page,buf,count);
+ copied = copy_from_iter(buffer->page, SIMPLE_ATTR_SIZE - 1, from);
buffer->needs_read_fill = 1;
/* if buf is assumed to contain a string, terminate it by \0,
* so e.g. sscanf() can scan the string easily */
- buffer->page[count] = 0;
- return error ? -EFAULT : count;
+ buffer->page[copied] = 0;
+ return copied ? : -EFAULT;
}
static int
@@ -252,58 +203,36 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou
}
-/**
- * configfs_write_file - write an attribute.
- * @file: file pointer
- * @buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
- * Similar to configfs_read_file(), though working in the opposite direction.
- * We allocate and fill the data from the user in fill_write_buffer(),
- * then push it to the config_item in flush_write_buffer().
- * There is no easy way for us to know if userspace is only doing a partial
- * write, so we don't support them. We expect the entire buffer to come
- * on the first write.
- * Hint: if you're writing a value, first read the file, modify only
- * the value you're changing, then write entire buffer back.
+/*
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on the
+ * first write.
+ * Hint: if you're writing a value, first read the file, modify only the value
+ * you're changing, then write entire buffer back.
*/
-
-static ssize_t
-configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+static ssize_t configfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct configfs_buffer *buffer = file->private_data;
ssize_t len;
mutex_lock(&buffer->mutex);
- len = fill_write_buffer(buffer, buf, count);
+ len = fill_write_buffer(buffer, from);
if (len > 0)
len = flush_write_buffer(file, buffer, len);
if (len > 0)
- *ppos += len;
+ iocb->ki_pos += len;
mutex_unlock(&buffer->mutex);
return len;
}
-/**
- * configfs_write_bin_file - write a binary attribute.
- * @file: file pointer
- * @buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
- * Writing to a binary attribute file is similar to a normal read.
- * We buffer the consecutive writes (binary attribute files do not
- * support lseek) in a continuously growing buffer, but we don't
- * commit until the close of the file.
- */
-
-static ssize_t
-configfs_write_bin_file(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t configfs_bin_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct configfs_buffer *buffer = file->private_data;
void *tbuf = NULL;
+ size_t end_offset;
ssize_t len;
mutex_lock(&buffer->mutex);
@@ -316,15 +245,14 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
buffer->write_in_progress = true;
/* buffer grows? */
- if (*ppos + count > buffer->bin_buffer_size) {
-
- if (buffer->cb_max_size &&
- *ppos + count > buffer->cb_max_size) {
+ end_offset = iocb->ki_pos + iov_iter_count(from);
+ if (end_offset > buffer->bin_buffer_size) {
+ if (buffer->cb_max_size && end_offset > buffer->cb_max_size) {
len = -EFBIG;
goto out;
}
- tbuf = vmalloc(*ppos + count);
+ tbuf = vmalloc(end_offset);
if (tbuf == NULL) {
len = -ENOMEM;
goto out;
@@ -339,16 +267,15 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
/* clear the new area */
memset(tbuf + buffer->bin_buffer_size, 0,
- *ppos + count - buffer->bin_buffer_size);
+ end_offset - buffer->bin_buffer_size);
buffer->bin_buffer = tbuf;
- buffer->bin_buffer_size = *ppos + count;
+ buffer->bin_buffer_size = end_offset;
}
- len = simple_write_to_buffer(buffer->bin_buffer,
- buffer->bin_buffer_size, ppos, buf, count);
+ len = copy_from_iter(buffer->bin_buffer, buffer->bin_buffer_size, from);
out:
mutex_unlock(&buffer->mutex);
- return len;
+ return len ? : -EFAULT;
}
static int __configfs_open_file(struct inode *inode, struct file *file, int type)
@@ -466,11 +393,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file)
{
struct configfs_buffer *buffer = file->private_data;
- buffer->read_in_progress = false;
-
if (buffer->write_in_progress) {
struct configfs_fragment *frag = to_frag(file);
- buffer->write_in_progress = false;
down_read(&frag->frag_sem);
if (!frag->frag_dead) {
@@ -480,29 +404,26 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file)
buffer->bin_buffer_size);
}
up_read(&frag->frag_sem);
- /* vfree on NULL is safe */
- vfree(buffer->bin_buffer);
- buffer->bin_buffer = NULL;
- buffer->bin_buffer_size = 0;
- buffer->needs_read_fill = 1;
}
+ vfree(buffer->bin_buffer);
+
configfs_release(inode, file);
return 0;
}
const struct file_operations configfs_file_operations = {
- .read = configfs_read_file,
- .write = configfs_write_file,
+ .read_iter = configfs_read_iter,
+ .write_iter = configfs_write_iter,
.llseek = generic_file_llseek,
.open = configfs_open_file,
.release = configfs_release,
};
const struct file_operations configfs_bin_file_operations = {
- .read = configfs_read_bin_file,
- .write = configfs_write_bin_file,
+ .read_iter = configfs_bin_read_iter,
+ .write_iter = configfs_bin_write_iter,
.llseek = NULL, /* bin file is not seekable */
.open = configfs_open_bin_file,
.release = configfs_release_bin_file,
@@ -532,7 +453,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
/**
* configfs_create_bin_file - create a binary attribute file for an item.
* @item: item we're creating for.
- * @attr: atrribute descriptor.
+ * @bin_attr: attribute descriptor.
*/
int configfs_create_bin_file(struct config_item *item,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index eb5ec3e46283..b601610e9907 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -28,12 +28,6 @@
static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
#endif
-static const struct address_space_operations configfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
-};
-
static const struct inode_operations configfs_inode_operations ={
.setattr = configfs_setattr,
};
@@ -114,7 +108,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
struct inode * inode = new_inode(s);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_mapping->a_ops = &configfs_aops;
+ inode->i_mapping->a_ops = &ram_aops;
inode->i_op = &configfs_inode_operations;
if (sd->s_iattr) {
diff --git a/fs/coredump.c b/fs/coredump.c
index 2868e3e171ae..c3d8fc14b993 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -519,7 +519,7 @@ static bool dump_interrupted(void)
* but then we need to teach dump_write() to restart and clear
* TIF_SIGPENDING.
*/
- return signal_pending(current);
+ return fatal_signal_pending(current) || freezing(current);
}
static void wait_for_dump_helpers(struct file *file)
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6ca7d16593ff..d00455440d08 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -344,13 +344,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
offsetof(struct fscrypt_nokey_name, sha256));
BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX);
- if (hash) {
- nokey_name.dirhash[0] = hash;
- nokey_name.dirhash[1] = minor_hash;
- } else {
- nokey_name.dirhash[0] = 0;
- nokey_name.dirhash[1] = 0;
- }
+ nokey_name.dirhash[0] = hash;
+ nokey_name.dirhash[1] = minor_hash;
+
if (iname->len <= sizeof(nokey_name.bytes)) {
memcpy(nokey_name.bytes, iname->name, iname->len);
size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]);
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 261293fb7097..bca9c6658a7c 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -210,15 +210,40 @@ out_unlock:
return err;
}
+/*
+ * Derive a SipHash key from the given fscrypt master key and the given
+ * application-specific information string.
+ *
+ * Note that the KDF produces a byte array, but the SipHash APIs expect the key
+ * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an
+ * endianness swap in order to get the same results as on little endian CPUs.
+ */
+static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk,
+ u8 context, const u8 *info,
+ unsigned int infolen, siphash_key_t *key)
+{
+ int err;
+
+ err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen,
+ (u8 *)key, sizeof(*key));
+ if (err)
+ return err;
+
+ BUILD_BUG_ON(sizeof(*key) != 16);
+ BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2);
+ le64_to_cpus(&key->key[0]);
+ le64_to_cpus(&key->key[1]);
+ return 0;
+}
+
int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
const struct fscrypt_master_key *mk)
{
int err;
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY,
- ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
- (u8 *)&ci->ci_dirhash_key,
- sizeof(ci->ci_dirhash_key));
+ err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY,
+ ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE,
+ &ci->ci_dirhash_key);
if (err)
return err;
ci->ci_dirhash_key_initialized = true;
@@ -253,10 +278,9 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
if (mk->mk_ino_hash_key_initialized)
goto unlock;
- err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0,
- (u8 *)&mk->mk_ino_hash_key,
- sizeof(mk->mk_ino_hash_key));
+ err = fscrypt_derive_siphash_key(mk,
+ HKDF_CONTEXT_INODE_HASH_KEY,
+ NULL, 0, &mk->mk_ino_hash_key);
if (err)
goto unlock;
/* pairs with smp_load_acquire() above */
diff --git a/fs/dax.c b/fs/dax.c
index 62352cbcf0f4..da41f9363568 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -488,10 +488,11 @@ static void *grab_mapping_entry(struct xa_state *xas,
struct address_space *mapping, unsigned int order)
{
unsigned long index = xas->xa_index;
- bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+ bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
void *entry;
retry:
+ pmd_downgrade = false;
xas_lock_irq(xas);
entry = get_unlocked_entry(xas, order);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index e813acfaa6e8..ba7c01cd9a5d 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -893,7 +893,7 @@ ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
copy[copy_len] = '\n';
- ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len);
+ ret = simple_read_from_buffer(user_buf, count, ppos, copy, len);
kfree(copy);
return ret;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 1d252164d97b..8129a430d789 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -45,10 +45,13 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS;
static int debugfs_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *ia)
{
- int ret = security_locked_down(LOCKDOWN_DEBUGFS);
+ int ret;
- if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
- return ret;
+ if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) {
+ ret = security_locked_down(LOCKDOWN_DEBUGFS);
+ if (ret)
+ return ret;
+ }
return simple_setattr(&init_user_ns, dentry, ia);
}
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 88d95d96e36c..42eee2783756 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -20,6 +20,7 @@
#include <net/sock.h>
#include "config.h"
+#include "midcomms.h"
#include "lowcomms.h"
/*
@@ -79,6 +80,9 @@ struct dlm_cluster {
unsigned int cl_new_rsb_count;
unsigned int cl_recover_callbacks;
char cl_cluster_name[DLM_LOCKSPACE_LEN];
+
+ struct dlm_spaces *sps;
+ struct dlm_comms *cms;
};
static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
@@ -204,7 +208,7 @@ static int dlm_check_zero(unsigned int x)
static int dlm_check_buffer_size(unsigned int x)
{
- if (x < DEFAULT_BUFFER_SIZE)
+ if (x < DLM_MAX_SOCKET_BUFSIZE)
return -EINVAL;
return 0;
@@ -409,6 +413,9 @@ static struct config_group *make_cluster(struct config_group *g,
if (!cl || !sps || !cms)
goto fail;
+ cl->sps = sps;
+ cl->cms = cms;
+
config_group_init_type_name(&cl->group, name, &cluster_type);
config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
@@ -458,6 +465,9 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
static void release_cluster(struct config_item *i)
{
struct dlm_cluster *cl = config_item_to_cluster(i);
+
+ kfree(cl->sps);
+ kfree(cl->cms);
kfree(cl);
}
@@ -532,7 +542,7 @@ static void drop_comm(struct config_group *g, struct config_item *i)
struct dlm_comm *cm = config_item_to_comm(i);
if (local_comm == cm)
local_comm = NULL;
- dlm_lowcomms_close(cm->nodeid);
+ dlm_midcomms_close(cm->nodeid);
while (cm->addr_count--)
kfree(cm->addr[cm->addr_count]);
config_item_put(i);
@@ -942,7 +952,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_SCAN_SECS 5
#define DEFAULT_LOG_DEBUG 0
#define DEFAULT_LOG_INFO 1
-#define DEFAULT_PROTOCOL 0
+#define DEFAULT_PROTOCOL DLM_PROTO_TCP
#define DEFAULT_MARK 0
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
#define DEFAULT_WAITWARN_US 0
@@ -952,7 +962,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
struct dlm_config_info dlm_config = {
.ci_tcp_port = DEFAULT_TCP_PORT,
- .ci_buffer_size = DEFAULT_BUFFER_SIZE,
+ .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE,
.ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
.ci_recover_timer = DEFAULT_RECOVER_TIMER,
.ci_toss_secs = DEFAULT_TOSS_SECS,
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index d2cd4bd20313..df92b0a07fc6 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -12,7 +12,7 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
-#define DEFAULT_BUFFER_SIZE 4096
+#define DLM_MAX_SOCKET_BUFSIZE 4096
struct dlm_config_node {
int nodeid;
@@ -23,6 +23,9 @@ struct dlm_config_node {
#define DLM_MAX_ADDR_COUNT 3
+#define DLM_PROTO_TCP 0
+#define DLM_PROTO_SCTP 1
+
struct dlm_config_info {
int ci_tcp_port;
int ci_buffer_size;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index d5bd990bcab8..47e9d57e4cae 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include "dlm_internal.h"
+#include "midcomms.h"
#include "lock.h"
#define DLM_DEBUG_BUF_LEN 4096
@@ -23,6 +24,7 @@ static char debug_buf[DLM_DEBUG_BUF_LEN];
static struct mutex debug_buf_lock;
static struct dentry *dlm_root;
+static struct dentry *dlm_comms;
static char *print_lockmode(int mode)
{
@@ -738,6 +740,57 @@ void dlm_delete_debug_file(struct dlm_ls *ls)
debugfs_remove(ls->ls_debug_toss_dentry);
}
+static int dlm_state_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "%s\n", dlm_midcomms_state(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_state);
+
+static int dlm_flags_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "%lu\n", dlm_midcomms_flags(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_flags);
+
+static int dlm_send_queue_cnt_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "%d\n", dlm_midcomms_send_queue_cnt(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_send_queue_cnt);
+
+static int dlm_version_show(struct seq_file *file, void *offset)
+{
+ seq_printf(file, "0x%08x\n", dlm_midcomms_version(file->private));
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_version);
+
+void *dlm_create_debug_comms_file(int nodeid, void *data)
+{
+ struct dentry *d_node;
+ char name[256];
+
+ memset(name, 0, sizeof(name));
+ snprintf(name, 256, "%d", nodeid);
+
+ d_node = debugfs_create_dir(name, dlm_comms);
+ debugfs_create_file("state", 0444, d_node, data, &dlm_state_fops);
+ debugfs_create_file("flags", 0444, d_node, data, &dlm_flags_fops);
+ debugfs_create_file("send_queue_count", 0444, d_node, data,
+ &dlm_send_queue_cnt_fops);
+ debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops);
+
+ return d_node;
+}
+
+void dlm_delete_debug_comms_file(void *ctx)
+{
+ debugfs_remove(ctx);
+}
+
void dlm_create_debug_file(struct dlm_ls *ls)
{
char name[DLM_LOCKSPACE_LEN + 8];
@@ -797,6 +850,7 @@ void __init dlm_register_debugfs(void)
{
mutex_init(&debug_buf_lock);
dlm_root = debugfs_create_dir("dlm", NULL);
+ dlm_comms = debugfs_create_dir("comms", dlm_root);
}
void dlm_unregister_debugfs(void)
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 04fe9f525ac7..91d1ca3a121a 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -57,9 +57,12 @@ struct dlm_header;
struct dlm_message;
struct dlm_rcom;
struct dlm_mhandle;
+struct dlm_msg;
#define log_print(fmt, args...) \
printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_print_ratelimited(fmt, args...) \
+ printk_ratelimited(KERN_ERR "dlm: "fmt"\n", ##args)
#define log_error(ls, fmt, args...) \
printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
@@ -368,23 +371,33 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
/* dlm_header is first element of all structs sent between nodes */
#define DLM_HEADER_MAJOR 0x00030000
-#define DLM_HEADER_MINOR 0x00000001
+#define DLM_HEADER_MINOR 0x00000002
+
+#define DLM_VERSION_3_1 0x00030001
+#define DLM_VERSION_3_2 0x00030002
#define DLM_HEADER_SLOTS 0x00000001
#define DLM_MSG 1
#define DLM_RCOM 2
+#define DLM_OPTS 3
+#define DLM_ACK 4
+#define DLM_FIN 5
struct dlm_header {
uint32_t h_version;
- uint32_t h_lockspace;
+ union {
+ /* for DLM_MSG and DLM_RCOM */
+ uint32_t h_lockspace;
+ /* for DLM_ACK and DLM_OPTS */
+ uint32_t h_seq;
+ } u;
uint32_t h_nodeid; /* nodeid of sender */
uint16_t h_length;
uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
uint8_t h_pad;
};
-
#define DLM_MSG_REQUEST 1
#define DLM_MSG_CONVERT 2
#define DLM_MSG_UNLOCK 3
@@ -452,10 +465,29 @@ struct dlm_rcom {
char rc_buf[];
};
+struct dlm_opt_header {
+ uint16_t t_type;
+ uint16_t t_length;
+ uint32_t o_pad;
+ /* need to be 8 byte aligned */
+ char t_value[];
+};
+
+/* encapsulation header */
+struct dlm_opts {
+ struct dlm_header o_header;
+ uint8_t o_nextcmd;
+ uint8_t o_pad;
+ uint16_t o_optlen;
+ uint32_t o_pad2;
+ char o_opts[];
+};
+
union dlm_packet {
struct dlm_header header; /* common to other two */
struct dlm_message message;
struct dlm_rcom rcom;
+ struct dlm_opts opts;
};
#define DLM_RSF_NEED_SLOTS 0x00000001
@@ -722,11 +754,15 @@ void dlm_register_debugfs(void);
void dlm_unregister_debugfs(void);
void dlm_create_debug_file(struct dlm_ls *ls);
void dlm_delete_debug_file(struct dlm_ls *ls);
+void *dlm_create_debug_comms_file(int nodeid, void *data);
+void dlm_delete_debug_comms_file(void *ctx);
#else
static inline void dlm_register_debugfs(void) { }
static inline void dlm_unregister_debugfs(void) { }
static inline void dlm_create_debug_file(struct dlm_ls *ls) { }
static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { return NULL; }
+static inline void dlm_delete_debug_comms_file(void *ctx) { }
#endif
#endif /* __DLM_INTERNAL_DOT_H__ */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b93df39d0915..c502c065d007 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -59,7 +59,7 @@
#include "dlm_internal.h"
#include <linux/dlm_device.h>
#include "memory.h"
-#include "lowcomms.h"
+#include "midcomms.h"
#include "requestqueue.h"
#include "util.h"
#include "dir.h"
@@ -3534,17 +3534,17 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
char *mb;
/* get_buffer gives us a message handle (mh) that we need to
- pass into lowcomms_commit and a message buffer (mb) that we
+ pass into midcomms_commit and a message buffer (mb) that we
write our data into */
- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
if (!mh)
return -ENOBUFS;
ms = (struct dlm_message *) mb;
ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- ms->m_header.h_lockspace = ls->ls_global_id;
+ ms->m_header.u.h_lockspace = ls->ls_global_id;
ms->m_header.h_nodeid = dlm_our_nodeid();
ms->m_header.h_length = mb_len;
ms->m_header.h_cmd = DLM_MSG;
@@ -3589,7 +3589,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
dlm_message_out(ms);
- dlm_lowcomms_commit_buffer(mh);
+ dlm_midcomms_commit_mhandle(mh);
return 0;
}
@@ -5038,16 +5038,16 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
if (hd->h_nodeid != nodeid) {
log_print("invalid h_nodeid %d from %d lockspace %x",
- hd->h_nodeid, nodeid, hd->h_lockspace);
+ hd->h_nodeid, nodeid, hd->u.h_lockspace);
return;
}
- ls = dlm_find_lockspace_global(hd->h_lockspace);
+ ls = dlm_find_lockspace_global(hd->u.h_lockspace);
if (!ls) {
if (dlm_config.ci_log_debug) {
printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
"%u from %d cmd %d type %d\n",
- hd->h_lockspace, nodeid, hd->h_cmd, type);
+ hd->u.h_lockspace, nodeid, hd->h_cmd, type);
}
if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index c14cf2b7faab..d71aba8c3e64 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -16,6 +16,7 @@
#include "member.h"
#include "recoverd.h"
#include "dir.h"
+#include "midcomms.h"
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
@@ -390,7 +391,7 @@ static int threads_start(void)
}
/* Thread for sending/receiving messages for all lockspace's */
- error = dlm_lowcomms_start();
+ error = dlm_midcomms_start();
if (error) {
log_print("cannot start dlm lowcomms %d", error);
goto scand_fail;
@@ -566,7 +567,12 @@ static int new_lockspace(const char *name, const char *cluster,
mutex_init(&ls->ls_requestqueue_mutex);
mutex_init(&ls->ls_clear_proc_locks);
- ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS);
+ /* For backwards compatibility with 3.1 we need to use the maximum
+ * possible dlm message size to be sure the message will fit and
+ * to avoid out-of-bounds issues. However, on the sending side 3.2
+ * might send less.
+ */
+ ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS);
if (!ls->ls_recover_buf)
goto out_lkbidr;
@@ -698,7 +704,7 @@ int dlm_new_lockspace(const char *name, const char *cluster,
error = 0;
if (!ls_count) {
dlm_scand_stop();
- dlm_lowcomms_shutdown();
+ dlm_midcomms_shutdown();
dlm_lowcomms_stop();
}
out:
@@ -787,7 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
if (ls_count == 1) {
dlm_scand_stop();
- dlm_lowcomms_shutdown();
+ dlm_midcomms_shutdown();
}
dlm_callback_stop(ls);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 166e36fcf3e4..0ea9ae35da0b 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -59,7 +59,6 @@
#include "config.h"
#define NEEDED_RMEM (4*1024*1024)
-#define CONN_HASH_SIZE 32
/* Number of messages to send before rescheduling */
#define MAX_SEND_MSG_COUNT 25
@@ -79,14 +78,20 @@ struct connection {
#define CF_CLOSING 8
#define CF_SHUTDOWN 9
#define CF_CONNECTED 10
+#define CF_RECONNECT 11
+#define CF_DELAY_CONNECT 12
+#define CF_EOF 13
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
+ atomic_t writequeue_cnt;
void (*connect_action) (struct connection *); /* What to do to connect */
void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
+ bool (*eof_condition)(struct connection *con); /* What to do to eof check */
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
struct connection *othercon;
+ struct connection *sendcon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
@@ -113,7 +118,22 @@ struct writequeue_entry {
int len;
int end;
int users;
+ bool dirty;
struct connection *con;
+ struct list_head msgs;
+ struct kref ref;
+};
+
+struct dlm_msg {
+ struct writequeue_entry *entry;
+ struct dlm_msg *orig_msg;
+ bool retransmit;
+ void *ppc;
+ int len;
+ int idx; /* new()/commit() idx exchange */
+
+ struct list_head list;
+ struct kref ref;
};
struct dlm_node_addr {
@@ -155,33 +175,23 @@ static void sctp_connect_to_sock(struct connection *con);
static void tcp_connect_to_sock(struct connection *con);
static void dlm_tcp_shutdown(struct connection *con);
-/* This is deliberately very simple because most clusters have simple
- sequential nodeids, so we should be able to go straight to a connection
- struct in the array */
-static inline int nodeid_hash(int nodeid)
+static struct connection *__find_con(int nodeid, int r)
{
- return nodeid & (CONN_HASH_SIZE-1);
-}
-
-static struct connection *__find_con(int nodeid)
-{
- int r, idx;
struct connection *con;
- r = nodeid_hash(nodeid);
-
- idx = srcu_read_lock(&connections_srcu);
hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
- if (con->nodeid == nodeid) {
- srcu_read_unlock(&connections_srcu, idx);
+ if (con->nodeid == nodeid)
return con;
- }
}
- srcu_read_unlock(&connections_srcu, idx);
return NULL;
}
+static bool tcp_eof_condition(struct connection *con)
+{
+ return atomic_read(&con->writequeue_cnt);
+}
+
static int dlm_con_init(struct connection *con, int nodeid)
{
con->rx_buflen = dlm_config.ci_buffer_size;
@@ -193,15 +203,23 @@ static int dlm_con_init(struct connection *con, int nodeid)
mutex_init(&con->sock_mutex);
INIT_LIST_HEAD(&con->writequeue);
spin_lock_init(&con->writequeue_lock);
+ atomic_set(&con->writequeue_cnt, 0);
INIT_WORK(&con->swork, process_send_sockets);
INIT_WORK(&con->rwork, process_recv_sockets);
init_waitqueue_head(&con->shutdown_wait);
- if (dlm_config.ci_protocol == 0) {
+ switch (dlm_config.ci_protocol) {
+ case DLM_PROTO_TCP:
con->connect_action = tcp_connect_to_sock;
con->shutdown_action = dlm_tcp_shutdown;
- } else {
+ con->eof_condition = tcp_eof_condition;
+ break;
+ case DLM_PROTO_SCTP:
con->connect_action = sctp_connect_to_sock;
+ break;
+ default:
+ kfree(con->rx_buf);
+ return -EINVAL;
}
return 0;
@@ -216,7 +234,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
struct connection *con, *tmp;
int r, ret;
- con = __find_con(nodeid);
+ r = nodeid_hash(nodeid);
+ con = __find_con(nodeid, r);
if (con || !alloc)
return con;
@@ -230,8 +249,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return NULL;
}
- r = nodeid_hash(nodeid);
-
spin_lock(&connections_lock);
/* Because multiple workqueues/threads calls this function it can
* race on multiple cpu's. Instead of locking hot path __find_con()
@@ -239,7 +256,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
* under protection of connections_lock. If this is the case we
* abort our connection creation and return the existing connection.
*/
- tmp = __find_con(nodeid);
+ tmp = __find_con(nodeid, r);
if (tmp) {
spin_unlock(&connections_lock);
kfree(con->rx_buf);
@@ -256,15 +273,13 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
/* Loop round all connections */
static void foreach_conn(void (*conn_func)(struct connection *c))
{
- int i, idx;
+ int i;
struct connection *con;
- idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
hlist_for_each_entry_rcu(con, &connection_hash[i], list)
conn_func(con);
}
- srcu_read_unlock(&connections_srcu, idx);
}
static struct dlm_node_addr *find_node_addr(int nodeid)
@@ -462,6 +477,9 @@ static void lowcomms_data_ready(struct sock *sk)
static void lowcomms_listen_data_ready(struct sock *sk)
{
+ if (!dlm_allow_conn)
+ return;
+
queue_work(recv_workqueue, &listen_con.rwork);
}
@@ -518,14 +536,21 @@ static void lowcomms_state_change(struct sock *sk)
int dlm_lowcomms_connect_node(int nodeid)
{
struct connection *con;
+ int idx;
if (nodeid == dlm_our_nodeid())
return 0;
+ idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, GFP_NOFS);
- if (!con)
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOMEM;
+ }
+
lowcomms_connect_sock(con);
+ srcu_read_unlock(&connections_srcu, idx);
+
return 0;
}
@@ -587,6 +612,22 @@ static void lowcomms_error_report(struct sock *sk)
dlm_config.ci_tcp_port, sk->sk_err,
sk->sk_err_soft);
}
+
+ /* everything below is handled on the sendcon only */
+ if (test_bit(CF_IS_OTHERCON, &con->flags))
+ con = con->sendcon;
+
+ switch (sk->sk_err) {
+ case ECONNREFUSED:
+ set_bit(CF_DELAY_CONNECT, &con->flags);
+ break;
+ default:
+ break;
+ }
+
+ if (!test_and_set_bit(CF_RECONNECT, &con->flags))
+ queue_work(send_workqueue, &con->swork);
+
out:
read_unlock_bh(&sk->sk_callback_lock);
if (orig_report)
@@ -669,6 +710,42 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len);
}
+static void dlm_page_release(struct kref *kref)
+{
+ struct writequeue_entry *e = container_of(kref, struct writequeue_entry,
+ ref);
+
+ __free_page(e->page);
+ kfree(e);
+}
+
+static void dlm_msg_release(struct kref *kref)
+{
+ struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref);
+
+ kref_put(&msg->entry->ref, dlm_page_release);
+ kfree(msg);
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+ struct dlm_msg *msg, *tmp;
+
+ list_for_each_entry_safe(msg, tmp, &e->msgs, list) {
+ if (msg->orig_msg) {
+ msg->orig_msg->retransmit = false;
+ kref_put(&msg->orig_msg->ref, dlm_msg_release);
+ }
+
+ list_del(&msg->list);
+ kref_put(&msg->ref, dlm_msg_release);
+ }
+
+ list_del(&e->list);
+ atomic_dec(&e->con->writequeue_cnt);
+ kref_put(&e->ref, dlm_page_release);
+}
+
static void dlm_close_sock(struct socket **sock)
{
if (*sock) {
@@ -683,6 +760,7 @@ static void close_connection(struct connection *con, bool and_other,
bool tx, bool rx)
{
bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
+ struct writequeue_entry *e;
if (tx && !closing && cancel_work_sync(&con->swork)) {
log_print("canceled swork for node %d", con->nodeid);
@@ -698,12 +776,35 @@ static void close_connection(struct connection *con, bool and_other,
if (con->othercon && and_other) {
/* Will only re-enter once. */
- close_connection(con->othercon, false, true, true);
+ close_connection(con->othercon, false, tx, rx);
+ }
+
+ /* if we sent a writequeue entry only halfway, we drop the
+ * whole entry, because after a reconnect we must not start in the
+ * middle of a msg, which would confuse the other end.
+ *
+ * we can always drop messages because of retransmits, but what we
+ * cannot allow is to transmit half messages which may be processed
+ * at the other side.
+ *
+ * our policy is to start from a clean state on disconnect; we don't
+ * know what was sent/received on the transport layer in this case.
+ */
+ spin_lock(&con->writequeue_lock);
+ if (!list_empty(&con->writequeue)) {
+ e = list_first_entry(&con->writequeue, struct writequeue_entry,
+ list);
+ if (e->dirty)
+ free_entry(e);
}
+ spin_unlock(&con->writequeue_lock);
con->rx_leftover = 0;
con->retries = 0;
clear_bit(CF_CONNECTED, &con->flags);
+ clear_bit(CF_DELAY_CONNECT, &con->flags);
+ clear_bit(CF_RECONNECT, &con->flags);
+ clear_bit(CF_EOF, &con->flags);
mutex_unlock(&con->sock_mutex);
clear_bit(CF_CLOSING, &con->flags);
}
@@ -841,19 +942,26 @@ out_resched:
return -EAGAIN;
out_close:
- mutex_unlock(&con->sock_mutex);
- if (ret != -EAGAIN) {
- /* Reconnect when there is something to send */
- close_connection(con, false, true, false);
- if (ret == 0) {
- log_print("connection %p got EOF from %d",
- con, con->nodeid);
+ if (ret == 0) {
+ log_print("connection %p got EOF from %d",
+ con, con->nodeid);
+
+ if (con->eof_condition && con->eof_condition(con)) {
+ set_bit(CF_EOF, &con->flags);
+ mutex_unlock(&con->sock_mutex);
+ } else {
+ mutex_unlock(&con->sock_mutex);
+ close_connection(con, false, true, false);
+
/* handling for tcp shutdown */
clear_bit(CF_SHUTDOWN, &con->flags);
wake_up(&con->shutdown_wait);
- /* signal to breaking receive worker */
- ret = -1;
}
+
+ /* signal to breaking receive worker */
+ ret = -1;
+ } else {
+ mutex_unlock(&con->sock_mutex);
}
return ret;
}
@@ -864,16 +972,12 @@ static int accept_from_sock(struct listen_connection *con)
int result;
struct sockaddr_storage peeraddr;
struct socket *newsock;
- int len;
+ int len, idx;
int nodeid;
struct connection *newcon;
struct connection *addcon;
unsigned int mark;
- if (!dlm_allow_conn) {
- return -1;
- }
-
if (!con->sock)
return -ENOTCONN;
@@ -907,8 +1011,10 @@ static int accept_from_sock(struct listen_connection *con)
* the same time and the connections cross on the wire.
* In this case we store the incoming one in "othercon"
*/
+ idx = srcu_read_lock(&connections_srcu);
newcon = nodeid2con(nodeid, GFP_NOFS);
if (!newcon) {
+ srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM;
goto accept_err;
}
@@ -924,6 +1030,7 @@ static int accept_from_sock(struct listen_connection *con)
if (!othercon) {
log_print("failed to allocate incoming socket");
mutex_unlock(&newcon->sock_mutex);
+ srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM;
goto accept_err;
}
@@ -932,11 +1039,14 @@ static int accept_from_sock(struct listen_connection *con)
if (result < 0) {
kfree(othercon);
mutex_unlock(&newcon->sock_mutex);
+ srcu_read_unlock(&connections_srcu, idx);
goto accept_err;
}
lockdep_set_subclass(&othercon->sock_mutex, 1);
+ set_bit(CF_IS_OTHERCON, &othercon->flags);
newcon->othercon = othercon;
+ othercon->sendcon = newcon;
} else {
/* close other sock con if we have something new */
close_connection(othercon, false, true, false);
@@ -966,6 +1076,8 @@ static int accept_from_sock(struct listen_connection *con)
if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
queue_work(recv_workqueue, &addcon->rwork);
+ srcu_read_unlock(&connections_srcu, idx);
+
return 0;
accept_err:
@@ -977,12 +1089,6 @@ accept_err:
return result;
}
-static void free_entry(struct writequeue_entry *e)
-{
- __free_page(e->page);
- kfree(e);
-}
-
/*
* writequeue_entry_complete - try to delete and free write queue entry
* @e: write queue entry to try to delete
@@ -994,11 +1100,11 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
{
e->offset += completed;
e->len -= completed;
+ /* signal that page was half way transmitted */
+ e->dirty = true;
- if (e->len == 0 && e->users == 0) {
- list_del(&e->list);
+ if (e->len == 0 && e->users == 0)
free_entry(e);
- }
}
/*
@@ -1075,7 +1181,7 @@ static void sctp_connect_to_sock(struct connection *con)
make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len);
- log_print("connecting to %d", con->nodeid);
+ log_print_ratelimited("connecting to %d", con->nodeid);
/* Turn off Nagle's algorithm */
sctp_sock_set_nodelay(sock->sk);
@@ -1171,7 +1277,7 @@ static void tcp_connect_to_sock(struct connection *con)
make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
- log_print("connecting to %d", con->nodeid);
+ log_print_ratelimited("connecting to %d", con->nodeid);
/* Turn off Nagle's algorithm */
tcp_sock_set_nodelay(sock->sk);
@@ -1364,12 +1470,16 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con,
entry->con = con;
entry->users = 1;
+ kref_init(&entry->ref);
+ INIT_LIST_HEAD(&entry->msgs);
return entry;
}
static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
- gfp_t allocation, char **ppc)
+ gfp_t allocation, char **ppc,
+ void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh)
{
struct writequeue_entry *e;
@@ -1377,7 +1487,12 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
if (!list_empty(&con->writequeue)) {
e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
if (DLM_WQ_REMAIN_BYTES(e) >= len) {
+ kref_get(&e->ref);
+
*ppc = page_address(e->page) + e->end;
+ if (cb)
+ cb(mh);
+
e->end += len;
e->users++;
spin_unlock(&con->writequeue_lock);
@@ -1391,42 +1506,92 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
if (!e)
return NULL;
+ kref_get(&e->ref);
*ppc = page_address(e->page);
e->end += len;
+ atomic_inc(&con->writequeue_cnt);
spin_lock(&con->writequeue_lock);
+ if (cb)
+ cb(mh);
+
list_add_tail(&e->list, &con->writequeue);
spin_unlock(&con->writequeue_lock);
return e;
};
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len,
+ gfp_t allocation, char **ppc,
+ void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh)
+{
+ struct writequeue_entry *e;
+ struct dlm_msg *msg;
+
+ msg = kzalloc(sizeof(*msg), allocation);
+ if (!msg)
+ return NULL;
+
+ kref_init(&msg->ref);
+
+ e = new_wq_entry(con, len, allocation, ppc, cb, mh);
+ if (!e) {
+ kfree(msg);
+ return NULL;
+ }
+
+ msg->ppc = *ppc;
+ msg->len = len;
+ msg->entry = e;
+
+ return msg;
+}
+
+struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
+ char **ppc, void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh)
{
struct connection *con;
+ struct dlm_msg *msg;
+ int idx;
- if (len > DEFAULT_BUFFER_SIZE ||
+ if (len > DLM_MAX_SOCKET_BUFSIZE ||
len < sizeof(struct dlm_header)) {
- BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE);
+ BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
log_print("failed to allocate a buffer of size %d", len);
WARN_ON(1);
return NULL;
}
+ idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, allocation);
- if (!con)
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return NULL;
+ }
- return new_wq_entry(con, len, allocation, ppc);
+ msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh);
+ if (!msg) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return NULL;
+ }
+
+ /* we assume that, if successful, commit must be called */
+ msg->idx = idx;
+ return msg;
}
-void dlm_lowcomms_commit_buffer(void *mh)
+static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
{
- struct writequeue_entry *e = (struct writequeue_entry *)mh;
+ struct writequeue_entry *e = msg->entry;
struct connection *con = e->con;
int users;
spin_lock(&con->writequeue_lock);
+ kref_get(&msg->ref);
+ list_add(&msg->list, &e->msgs);
+
users = --e->users;
if (users)
goto out;
@@ -1442,6 +1607,42 @@ out:
return;
}
+/* Commit @msg and then release the connections_srcu read lock using the
+ * index that dlm_lowcomms_new_msg() stored in msg->idx.
+ */
+void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
+{
+	int srcu_idx;
+
+	_dlm_lowcomms_commit_msg(msg);
+
+	srcu_idx = msg->idx;
+	srcu_read_unlock(&connections_srcu, srcu_idx);
+}
+
+/* Drop one reference on @msg; dlm_msg_release() runs when the last
+ * reference goes away.
+ */
+void dlm_lowcomms_put_msg(struct dlm_msg *msg)
+{
+	struct kref *ref = &msg->ref;
+
+	kref_put(ref, dlm_msg_release);
+}
+
+/* Queue a copy of @msg on the same connection for retransmission.
+ *
+ * Does not hold connections_srcu; to be used from the workqueue only.
+ *
+ * Returns 1 if @msg was already marked as retransmitted, -ENOMEM if no new
+ * message could be allocated, and 0 once the copy has been committed.
+ */
+int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
+{
+	struct dlm_msg *copy;
+	char *buf;
+
+	if (msg->retransmit)
+		return 1;
+
+	copy = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len,
+					GFP_ATOMIC, &buf, NULL, NULL);
+	if (!copy)
+		return -ENOMEM;
+
+	/* the copy keeps a reference on the original via orig_msg */
+	msg->retransmit = true;
+	kref_get(&msg->ref);
+	copy->orig_msg = msg;
+
+	memcpy(buf, msg->ppc, msg->len);
+	_dlm_lowcomms_commit_msg(copy);
+	dlm_lowcomms_put_msg(copy);
+
+	return 0;
+}
+
/* Send a message */
static void send_to_sock(struct connection *con)
{
@@ -1483,7 +1684,7 @@ static void send_to_sock(struct connection *con)
cond_resched();
goto out;
} else if (ret < 0)
- goto send_error;
+ goto out;
}
/* Don't starve people filling buffers */
@@ -1496,16 +1697,23 @@ static void send_to_sock(struct connection *con)
writequeue_entry_complete(e, ret);
}
spin_unlock(&con->writequeue_lock);
-out:
- mutex_unlock(&con->sock_mutex);
+
+ /* close if we got EOF */
+ if (test_and_clear_bit(CF_EOF, &con->flags)) {
+ mutex_unlock(&con->sock_mutex);
+ close_connection(con, false, false, true);
+
+ /* handling for tcp shutdown */
+ clear_bit(CF_SHUTDOWN, &con->flags);
+ wake_up(&con->shutdown_wait);
+ } else {
+ mutex_unlock(&con->sock_mutex);
+ }
+
return;
-send_error:
+out:
mutex_unlock(&con->sock_mutex);
- close_connection(con, false, false, true);
- /* Requeue the send work. When the work daemon runs again, it will try
- a new connection, then call this function again. */
- queue_work(send_workqueue, &con->swork);
return;
out_connect:
@@ -1520,7 +1728,6 @@ static void clean_one_writequeue(struct connection *con)
spin_lock(&con->writequeue_lock);
list_for_each_entry_safe(e, safe, &con->writequeue, list) {
- list_del(&e->list);
free_entry(e);
}
spin_unlock(&con->writequeue_lock);
@@ -1532,8 +1739,10 @@ int dlm_lowcomms_close(int nodeid)
{
struct connection *con;
struct dlm_node_addr *na;
+ int idx;
log_print("closing connection to node %d", nodeid);
+ idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, 0);
if (con) {
set_bit(CF_CLOSE, &con->flags);
@@ -1542,6 +1751,7 @@ int dlm_lowcomms_close(int nodeid)
if (con->othercon)
clean_one_writequeue(con->othercon);
}
+ srcu_read_unlock(&connections_srcu, idx);
spin_lock(&dlm_node_addrs_spin);
na = find_node_addr(nodeid);
@@ -1578,35 +1788,50 @@ static void process_send_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, swork);
+ WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
+
clear_bit(CF_WRITE_PENDING, &con->flags);
- if (con->sock == NULL) /* not mutex protected so check it inside too */
+
+ if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
+ close_connection(con, false, false, true);
+ dlm_midcomms_unack_msg_resend(con->nodeid);
+ }
+
+ if (con->sock == NULL) { /* not mutex protected so check it inside too */
+ if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
+ msleep(1000);
con->connect_action(con);
+ }
if (!list_empty(&con->writequeue))
send_to_sock(con);
}
static void work_stop(void)
{
- if (recv_workqueue)
+ if (recv_workqueue) {
destroy_workqueue(recv_workqueue);
- if (send_workqueue)
+ recv_workqueue = NULL;
+ }
+
+ if (send_workqueue) {
destroy_workqueue(send_workqueue);
+ send_workqueue = NULL;
+ }
}
static int work_start(void)
{
- recv_workqueue = alloc_workqueue("dlm_recv",
- WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+ recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
if (!recv_workqueue) {
log_print("can't start dlm_recv");
return -ENOMEM;
}
- send_workqueue = alloc_workqueue("dlm_send",
- WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
+ send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
if (!send_workqueue) {
log_print("can't start dlm_send");
destroy_workqueue(recv_workqueue);
+ recv_workqueue = NULL;
return -ENOMEM;
}
@@ -1621,6 +1846,8 @@ static void shutdown_conn(struct connection *con)
void dlm_lowcomms_shutdown(void)
{
+ int idx;
+
/* Set all the flags to prevent any
* socket activity.
*/
@@ -1633,7 +1860,9 @@ void dlm_lowcomms_shutdown(void)
dlm_close_sock(&listen_con.sock);
+ idx = srcu_read_lock(&connections_srcu);
foreach_conn(shutdown_conn);
+ srcu_read_unlock(&connections_srcu, idx);
}
static void _stop_conn(struct connection *con, bool and_other)
@@ -1682,7 +1911,7 @@ static void free_conn(struct connection *con)
static void work_flush(void)
{
- int ok, idx;
+ int ok;
int i;
struct connection *con;
@@ -1693,7 +1922,6 @@ static void work_flush(void)
flush_workqueue(recv_workqueue);
if (send_workqueue)
flush_workqueue(send_workqueue);
- idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
hlist_for_each_entry_rcu(con, &connection_hash[i],
list) {
@@ -1707,14 +1935,17 @@ static void work_flush(void)
}
}
}
- srcu_read_unlock(&connections_srcu, idx);
} while (!ok);
}
void dlm_lowcomms_stop(void)
{
+ int idx;
+
+ idx = srcu_read_lock(&connections_srcu);
work_flush();
foreach_conn(free_conn);
+ srcu_read_unlock(&connections_srcu, idx);
work_stop();
deinit_local();
}
@@ -1738,15 +1969,24 @@ int dlm_lowcomms_start(void)
error = work_start();
if (error)
- goto fail;
+ goto fail_local;
dlm_allow_conn = 1;
/* Start listening */
- if (dlm_config.ci_protocol == 0)
+ switch (dlm_config.ci_protocol) {
+ case DLM_PROTO_TCP:
error = tcp_listen_for_all();
- else
+ break;
+ case DLM_PROTO_SCTP:
error = sctp_listen_for_all(&listen_con);
+ break;
+ default:
+ log_print("Invalid protocol identifier %d set",
+ dlm_config.ci_protocol);
+ error = -EINVAL;
+ break;
+ }
if (error)
goto fail_unlisten;
@@ -1755,6 +1995,9 @@ int dlm_lowcomms_start(void)
fail_unlisten:
dlm_allow_conn = 0;
dlm_close_sock(&listen_con.sock);
+ work_stop();
+fail_local:
+ deinit_local();
fail:
return error;
}
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 48bbc4e18761..aaae7115c00d 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -12,7 +12,22 @@
#ifndef __LOWCOMMS_DOT_H__
#define __LOWCOMMS_DOT_H__
-#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096
+#include "dlm_internal.h"
+
+#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts)
+#define DLM_MAX_APP_BUFSIZE (DLM_MAX_SOCKET_BUFSIZE - \
+ DLM_MIDCOMMS_OPT_LEN)
+
+#define CONN_HASH_SIZE 32
+
+/* Map a nodeid to a hash bucket by keeping only its low bits. This is kept
+ * deliberately simple: most clusters use small sequential nodeids, so the
+ * lookup usually lands directly on the right bucket.
+ */
+static inline int nodeid_hash(int nodeid)
+{
+	const int mask = CONN_HASH_SIZE - 1;
+
+	return nodeid & mask;
+}
/* switch to check if dlm is running */
extern int dlm_allow_conn;
@@ -22,8 +37,12 @@ void dlm_lowcomms_shutdown(void);
void dlm_lowcomms_stop(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
-void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
-void dlm_lowcomms_commit_buffer(void *mh);
+struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
+ char **ppc, void (*cb)(struct dlm_mhandle *mh),
+ struct dlm_mhandle *mh);
+void dlm_lowcomms_commit_msg(struct dlm_msg *msg);
+void dlm_lowcomms_put_msg(struct dlm_msg *msg);
+int dlm_lowcomms_resend_msg(struct dlm_msg *msg);
int dlm_lowcomms_connect_node(int nodeid);
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark);
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index ceef3f2074ff..d9e1e4170eb1 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -15,6 +15,7 @@
#include "recover.h"
#include "rcom.h"
#include "config.h"
+#include "midcomms.h"
#include "lowcomms.h"
int dlm_slots_version(struct dlm_header *h)
@@ -270,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
log_slots(ls, gen, num, NULL, array, array_size);
- max_slots = (LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) -
+ max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) -
sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
if (num > max_slots) {
@@ -329,6 +330,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
memb->nodeid = node->nodeid;
memb->weight = node->weight;
memb->comm_seq = node->comm_seq;
+ dlm_midcomms_add_member(node->nodeid);
add_ordered_member(ls, memb);
ls->ls_num_nodes++;
return 0;
@@ -359,26 +361,34 @@ int dlm_is_removed(struct dlm_ls *ls, int nodeid)
return 0;
}
-static void clear_memb_list(struct list_head *head)
+static void clear_memb_list(struct list_head *head,
+ void (*after_del)(int nodeid))
{
struct dlm_member *memb;
while (!list_empty(head)) {
memb = list_entry(head->next, struct dlm_member, list);
list_del(&memb->list);
+ if (after_del)
+ after_del(memb->nodeid);
kfree(memb);
}
}
+/* after_del callback for clear_memb_list(): tell midcomms that @nodeid is
+ * no longer a member.
+ */
+static void clear_members_cb(int nodeid)
+{
+	dlm_midcomms_remove_member(nodeid);
+}
+
void dlm_clear_members(struct dlm_ls *ls)
{
- clear_memb_list(&ls->ls_nodes);
+ clear_memb_list(&ls->ls_nodes, clear_members_cb);
ls->ls_num_nodes = 0;
}
void dlm_clear_members_gone(struct dlm_ls *ls)
{
- clear_memb_list(&ls->ls_nodes_gone);
+ clear_memb_list(&ls->ls_nodes_gone, NULL);
}
static void make_member_array(struct dlm_ls *ls)
@@ -552,6 +562,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
neg++;
list_move(&memb->list, &ls->ls_nodes_gone);
+ dlm_midcomms_remove_member(memb->nodeid);
ls->ls_num_nodes--;
dlm_lsop_recover_slot(ls, memb);
}
@@ -576,12 +587,18 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
*neg_out = neg;
error = ping_members(ls);
- if (!error || error == -EPROTO) {
- /* new_lockspace() may be waiting to know if the config
- is good or bad */
- ls->ls_members_result = error;
- complete(&ls->ls_members_done);
- }
+ /* error -EINTR means that a new recovery action was triggered.
+ * We ignore this recovery action and let the new one run, which might
+ * have a new member configuration.
+ */
+ if (error == -EINTR)
+ error = 0;
+
+ /* new_lockspace() may be waiting to know if the config
+ * is good or bad
+ */
+ ls->ls_members_result = error;
+ complete(&ls->ls_members_done);
log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 1c6654a21ec4..e3de268898ed 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -3,7 +3,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved.
**
**
*******************************************************************************
@@ -12,22 +12,866 @@
/*
* midcomms.c
*
- * This is the appallingly named "mid-level" comms layer.
+ * This is the appallingly named "mid-level" comms layer. It takes care of
+ * delivering "reliable" application-layer communication on top of the
+ * lowcomms transport layer.
*
- * Its purpose is to take packets from the "real" comms layer,
- * split them up into packets and pass them to the interested
- * part of the locking mechanism.
+ * How it works:
*
- * It also takes messages from the locking layer, formats them
- * into packets and sends them to the comms layer.
+ * Each node keeps track of all sent DLM messages in send_queue with a sequence
+ * number. The receiver will send a DLM_ACK message back for every DLM message
+ * received at the other side. If a reconnect happens in lowcomms we will send
+ * all unacknowledged dlm messages again. The receiving side might drop any already
+ * received message by comparing sequence numbers.
+ *
+ * How version detection works:
+ *
+ * Due to the fact that dlm has pre-configured node addresses on every side,
+ * it is in its nature that every side connects at start to transmit
+ * dlm messages, which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS
+ * and their replies are the first messages which are exchanged. Due to backwards
+ * compatibility these messages are not covered by the midcomms re-transmission
+ * layer. These messages have their own re-transmission handling in the dlm
+ * application layer. The version field of every node will be set on these RCOM
+ * messages as soon as they arrive and the node isn't yet part of the nodes
+ * hash. There also exists logic to detect a version mismatch if something weird
+ * is going on or the first message isn't an expected one.
+ *
+ * Termination:
+ *
+ * The midcomms layer does a 4-way handshake for termination at the DLM protocol
+ * level, like TCP supports with half-closed sockets. SCTP doesn't support
+ * half-closed sockets, so we do it at the DLM layer. Also socket shutdown() can
+ * be interrupted by e.g. a tcp reset. Additionally there exists the othercon
+ * paradigm in lowcomms which cannot easily be changed without breaking backwards
+ * compatibility. A node cannot send anything to another node after a DLM_FIN
+ * message was sent. There exists additional logic to print a warning if
+ * DLM wants to do it. There exists state handling like RFC 793 but reduced
+ * to termination only. The "member removal event" means that the cluster
+ * manager removed the node from its internal lists; from this point on DLM does
+ * not send any message to the other node. There exist two cases:
+ *
+ * 1. The cluster member was removed and we received a FIN
+ * OR
+ * 2. We received a FIN but the member was not removed yet
+ *
+ * One of these cases will do the CLOSE_WAIT to LAST_ACK change.
+ *
+ *
+ * +---------+
+ * | CLOSED |
+ * +---------+
+ * | add member/receive RCOM version
+ * | detection msg
+ * V
+ * +---------+
+ * | ESTAB |
+ * +---------+
+ * CLOSE | | rcv FIN
+ * ------- | | -------
+ * +---------+ snd FIN / \ snd ACK +---------+
+ * | FIN |<----------------- ------------------>| CLOSE |
+ * | WAIT-1 |------------------ | WAIT |
+ * +---------+ rcv FIN \ +---------+
+ * | rcv ACK of FIN ------- | CLOSE | member
+ * | -------------- snd ACK | ------- | removal
+ * V x V snd FIN V event
+ * +---------+ +---------+ +---------+
+ * |FINWAIT-2| | CLOSING | | LAST-ACK|
+ * +---------+ +---------+ +---------+
+ * | rcv ACK of FIN | rcv ACK of FIN |
+ * | rcv FIN -------------- | -------------- |
+ * | ------- x V x V
+ * \ snd ACK +---------+ +---------+
+ * ------------------------>| CLOSED | | CLOSED |
+ * +---------+ +---------+
+ *
+ * NOTE: any state can be interrupted by midcomms_close() and the state will be
+ * switched to CLOSED in case of fencing. There also exists some timeout
+ * handling when we receive the version detection RCOM messages which is
+ * made by observation.
+ *
+ * Future improvements:
+ *
+ * There exist some known issues/improvements of the dlm handling. Some
+ * of them should be done in the next major dlm version bump, which makes
+ * it incompatible with previous versions.
+ *
+ * Unaligned memory access:
+ *
+ * There exist cases when the dlm message buffer length is not aligned
+ * to 8 bytes. However it seems nobody has detected any problem with it. This
+ * can be fixed in the next major version bump of dlm.
+ *
+ * Version detection:
+ *
+ * The version detection and how it's done is related to backwards
+ * compatibility. There exist better ways to handle this.
+ * However this should be changed in the next major version bump of dlm.
+ *
+ * Ack handling:
+ *
+ * Currently we send an ack message for every dlm message. However we
+ * can ack multiple dlm messages with one ack by just delaying the ack
+ * message. Will reduce some traffic but makes the drop detection slower.
+ *
+ * Tail Size checking:
+ *
+ * There exists a message tail payload in e.g. DLM_MSG, however we don't
+ * yet check it against the message length with regard to the receive buffer
+ * length. That needs to be validated.
+ *
+ * Fencing bad nodes:
+ *
+ * At timeout places or weird sequence number behaviours we should send
+ * a fencing request to the cluster manager.
+ */
+
+/* Debug switch to enable a 5 second sleep while waiting for a termination.
+ * This can be useful to test fencing while termination is running.
+ * This requires a setup with only gfs2 as dlm user, so that the
+ * last umount will terminate the connection.
+ *
+ * It is useful for testing: while umount blocks for the 5 seconds,
+ * just press the reset button. With a lot of dropped messages the
+ * termination process could take several seconds.
*/
+#define DLM_DEBUG_FENCE_TERMINATION 0
+
+#include <net/tcp.h>
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "lock.h"
+#include "util.h"
#include "midcomms.h"
+/* init value for sequence numbers for testing purpose only e.g. overflows */
+#define DLM_SEQ_INIT 0
+/* 3 minutes wait to sync ending of dlm */
+#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000)
+#define DLM_VERSION_NOT_SET 0
+
+struct midcomms_node { /* per-peer midcomms state, kept in node_hash[] keyed by nodeid */
+ int nodeid;
+ uint32_t version; /* DLM_VERSION_NOT_SET until a version check callback sets it */
+ uint32_t seq_send; /* presumably the next sequence number to send - TODO confirm against get_mhandle */
+ uint32_t seq_next; /* sequence number expected on the next received message */
+ /* These queues are unbounded because we cannot drop any message in dlm.
+ * We could send a fence signal for a specific node to the cluster
+ * manager if a queue hits some maximum value, however this handling
+ * is not supported yet.
+ */
+ struct list_head send_queue; /* dlm_mhandle entries, walked when an ack arrives */
+ spinlock_t send_queue_lock; /* held while entries are removed from send_queue */
+ atomic_t send_queue_cnt; /* decremented whenever an entry is deleted from send_queue */
+#define DLM_NODE_FLAG_CLOSE 1
+#define DLM_NODE_FLAG_STOP_TX 2
+#define DLM_NODE_FLAG_STOP_RX 3
+ unsigned long flags; /* DLM_NODE_FLAG_* bit numbers, used with set_bit()/test_bit() */
+ wait_queue_head_t shutdown_wait; /* woken when the node is reset to DLM_CLOSED */
+
+ /* dlm tcp termination state */
+#define DLM_CLOSED 1
+#define DLM_ESTABLISHED 2
+#define DLM_FIN_WAIT1 3
+#define DLM_FIN_WAIT2 4
+#define DLM_CLOSE_WAIT 5
+#define DLM_LAST_ACK 6
+#define DLM_CLOSING 7
+ int state; /* one of the DLM_* termination states above */
+ spinlock_t state_lock; /* taken around the state transitions in the receive path */
+
+ /* counts how many lockspaces are using this node;
+ * this refcount is necessary to determine if the
+ * node wants to disconnect.
+ */
+ int users;
+
+ /* not protected by srcu, node_hash lifetime */
+ void *debugfs; /* return value of dlm_create_debug_comms_file() */
+
+ struct hlist_node hlist; /* link in node_hash[] */
+ struct rcu_head rcu;
+};
+
+struct dlm_mhandle { /* one entry of a midcomms_node send_queue */
+ const struct dlm_header *inner_hd;
+ struct midcomms_node *node;
+ struct dlm_opts *opts;
+ struct dlm_msg *msg; /* lowcomms message; its reference is put when the handle is released */
+ bool committed;
+ uint32_t seq; /* sequence number compared against incoming acks */
+
+ void (*ack_rcv)(struct midcomms_node *node); /* optional; called when an ack covering this entry arrives */
+
+ /* get_mhandle/commit srcu idx exchange */
+ int idx;
+
+ struct list_head list; /* link in node->send_queue */
+ struct rcu_head rcu; /* used by call_rcu() to defer dlm_mhandle_release() */
+};
+
+static struct hlist_head node_hash[CONN_HASH_SIZE];
+static DEFINE_SPINLOCK(nodes_lock);
+DEFINE_STATIC_SRCU(nodes_srcu);
+
+/* This mutex prevents midcomms_close() from running concurrently with
+ * stop() or remove(). Invalid memory accesses were observed when
+ * DLM_DEBUG_FENCE_TERMINATION is enabled and machines are reset;
+ * it ends in a double deletion in the nodes data structure.
+ */
+static DEFINE_MUTEX(close_lock);
+
+/* Translate a DLM_* termination state into a printable name; any value
+ * that is not a known state yields "UNKNOWN".
+ */
+static inline const char *dlm_state_str(int state)
+{
+	const char *name = "UNKNOWN";
+
+	switch (state) {
+	case DLM_CLOSED:
+		name = "CLOSED";
+		break;
+	case DLM_ESTABLISHED:
+		name = "ESTABLISHED";
+		break;
+	case DLM_FIN_WAIT1:
+		name = "FIN_WAIT1";
+		break;
+	case DLM_FIN_WAIT2:
+		name = "FIN_WAIT2";
+		break;
+	case DLM_CLOSE_WAIT:
+		name = "CLOSE_WAIT";
+		break;
+	case DLM_LAST_ACK:
+		name = "LAST_ACK";
+		break;
+	case DLM_CLOSING:
+		name = "CLOSING";
+		break;
+	default:
+		break;
+	}
+
+	return name;
+}
+
+/* Return the printable name of @node's current termination state. */
+const char *dlm_midcomms_state(struct midcomms_node *node)
+{
+	const int state = node->state;
+
+	return dlm_state_str(state);
+}
+
+/* Return a snapshot of @node's DLM_NODE_FLAG_* bit field. */
+unsigned long dlm_midcomms_flags(struct midcomms_node *node)
+{
+	const unsigned long flags = node->flags;
+
+	return flags;
+}
+
+/* Return the current value of @node's send queue counter. */
+int dlm_midcomms_send_queue_cnt(struct midcomms_node *node)
+{
+	const int queued = atomic_read(&node->send_queue_cnt);
+
+	return queued;
+}
+
+/* Return the protocol version recorded for @node (DLM_VERSION_NOT_SET
+ * while none has been detected).
+ */
+uint32_t dlm_midcomms_version(struct midcomms_node *node)
+{
+	const uint32_t version = node->version;
+
+	return version;
+}
+
+/* Search bucket @r of node_hash for the node with id @nodeid.
+ * Returns the node or NULL if the bucket has no such entry.
+ */
+static struct midcomms_node *__find_node(int nodeid, int r)
+{
+	struct midcomms_node *cur;
+
+	hlist_for_each_entry_rcu(cur, &node_hash[r], hlist) {
+		if (cur->nodeid != nodeid)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* RCU callback: drop the handle's reference on its lowcomms message and
+ * free the handle itself.
+ */
+static void dlm_mhandle_release(struct rcu_head *rcu)
+{
+	struct dlm_mhandle *handle;
+
+	handle = container_of(rcu, struct dlm_mhandle, rcu);
+	dlm_lowcomms_put_msg(handle->msg);
+	kfree(handle);
+}
+
+/* Unlink @mh from @node's send queue, account for the removal and defer
+ * the actual release via call_rcu(). The callers in this file invoke it
+ * with node->send_queue_lock held.
+ */
+static void dlm_mhandle_delete(struct midcomms_node *node,
+			       struct dlm_mhandle *mh)
+{
+	list_del_rcu(&mh->list);
+	atomic_dec(&node->send_queue_cnt);
+
+	/* readers may still be walking the list, so free after a grace period */
+	call_rcu(&mh->rcu, dlm_mhandle_release);
+}
+
+/* Remove every entry from @node's send queue. */
+static void dlm_send_queue_flush(struct midcomms_node *node)
+{
+	struct dlm_mhandle *entry;
+
+	pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
+
+	rcu_read_lock();
+	spin_lock(&node->send_queue_lock);
+
+	list_for_each_entry_rcu(entry, &node->send_queue, list)
+		dlm_mhandle_delete(node, entry);
+
+	spin_unlock(&node->send_queue_lock);
+	rcu_read_unlock();
+}
+
+/* Put @node back into its initial state: sequence numbers, version and
+ * flags are cleared, the send queue is flushed, the state becomes
+ * DLM_CLOSED and anybody sleeping on shutdown_wait is woken.
+ */
+static void midcomms_node_reset(struct midcomms_node *node)
+{
+	pr_debug("reset node %d\n", node->nodeid);
+
+	node->seq_next = DLM_SEQ_INIT;
+	node->seq_send = DLM_SEQ_INIT;
+	node->version = DLM_VERSION_NOT_SET;
+	node->flags = 0;
+
+	dlm_send_queue_flush(node);
+
+	node->state = DLM_CLOSED;
+	wake_up(&node->shutdown_wait);
+}
+
+/* Look up the midcomms node for @nodeid and, when @alloc is non-zero,
+ * create and publish it if it does not exist yet.
+ *
+ * Returns the node, or NULL if it is unknown and either @alloc is 0 or the
+ * allocation failed.
+ */
+static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc)
+{
+	const int bucket = nodeid_hash(nodeid);
+	struct midcomms_node *node, *existing;
+
+	node = __find_node(nodeid, bucket);
+	if (node || !alloc)
+		return node;
+
+	node = kmalloc(sizeof(*node), alloc);
+	if (!node)
+		return NULL;
+
+	node->nodeid = nodeid;
+	spin_lock_init(&node->state_lock);
+	spin_lock_init(&node->send_queue_lock);
+	atomic_set(&node->send_queue_cnt, 0);
+	INIT_LIST_HEAD(&node->send_queue);
+	init_waitqueue_head(&node->shutdown_wait);
+	node->users = 0;
+	midcomms_node_reset(node);
+
+	spin_lock(&nodes_lock);
+	/* somebody else may have added the node while we were allocating */
+	existing = __find_node(nodeid, bucket);
+	if (!existing)
+		hlist_add_head_rcu(&node->hlist, &node_hash[bucket]);
+	spin_unlock(&nodes_lock);
+
+	if (existing) {
+		kfree(node);
+		return existing;
+	}
+
+	node->debugfs = dlm_create_debug_comms_file(nodeid, node);
+	return node;
+}
+
+/* Send a header-only DLM_ACK message carrying @seq to @nodeid.
+ * Returns 0 on success or -ENOMEM if no lowcomms message could be obtained.
+ */
+static int dlm_send_ack(int nodeid, uint32_t seq)
+{
+	int hdr_len = sizeof(struct dlm_header);
+	struct dlm_header *hdr;
+	struct dlm_msg *ack;
+	char *buf;
+
+	ack = dlm_lowcomms_new_msg(nodeid, hdr_len, GFP_NOFS, &buf,
+				   NULL, NULL);
+	if (!ack)
+		return -ENOMEM;
+
+	hdr = (struct dlm_header *)buf;
+	hdr->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	hdr->h_nodeid = dlm_our_nodeid();
+	hdr->h_length = hdr_len;
+	hdr->h_cmd = DLM_ACK;
+	hdr->u.h_seq = seq;
+	header_out(hdr);
+
+	dlm_lowcomms_commit_msg(ack);
+	dlm_lowcomms_put_msg(ack);
+
+	return 0;
+}
+
+/* Send a header-only DLM_FIN message to @node through a midcomms handle.
+ * @ack_rcv is stored in the handle so it is invoked when the FIN gets
+ * acknowledged. After the commit DLM_NODE_FLAG_STOP_TX is set on the node.
+ *
+ * Returns 0 on success or -ENOMEM if no handle could be obtained.
+ */
+static int dlm_send_fin(struct midcomms_node *node,
+			void (*ack_rcv)(struct midcomms_node *node))
+{
+	int hdr_len = sizeof(struct dlm_header);
+	struct dlm_mhandle *handle;
+	struct dlm_header *hdr;
+	char *buf;
+
+	handle = dlm_midcomms_get_mhandle(node->nodeid, hdr_len, GFP_NOFS,
+					  &buf);
+	if (!handle)
+		return -ENOMEM;
+
+	handle->ack_rcv = ack_rcv;
+
+	hdr = (struct dlm_header *)buf;
+	hdr->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	hdr->h_nodeid = dlm_our_nodeid();
+	hdr->h_length = hdr_len;
+	hdr->h_cmd = DLM_FIN;
+	header_out(hdr);
+
+	pr_debug("sending fin msg to node %d\n", node->nodeid);
+	dlm_midcomms_commit_mhandle(handle);
+	set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
+
+	return 0;
+}
+
+/* Process an ack for @seq from @node: every queued handle whose sequence
+ * number is before @seq first gets its ack_rcv callback (if any) invoked,
+ * then those handles are removed from the send queue under
+ * send_queue_lock. Both walks stop at the first entry that is not before
+ * @seq, relying on the queue being ordered.
+ */
+static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
+{
+	struct dlm_mhandle *entry;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(entry, &node->send_queue, list) {
+		/* send queue should be ordered */
+		if (!before(entry->seq, seq))
+			break;
+
+		if (entry->ack_rcv)
+			entry->ack_rcv(node);
+	}
+
+	spin_lock(&node->send_queue_lock);
+	list_for_each_entry_rcu(entry, &node->send_queue, list) {
+		/* send queue should be ordered */
+		if (!before(entry->seq, seq))
+			break;
+
+		dlm_mhandle_delete(node, entry);
+	}
+	spin_unlock(&node->send_queue_lock);
+
+	rcu_read_unlock();
+}
+
+/* ack_rcv callback for the FIN sent during a passive shutdown.
+ *
+ * In DLM_LAST_ACK the node is reset (which moves it to DLM_CLOSED); in
+ * DLM_CLOSED only the shutdown waiters are woken. Any other state is
+ * reported and triggers a WARN.
+ */
+static void dlm_pas_fin_ack_rcv(struct midcomms_node *node)
+{
+	int state;
+
+	spin_lock(&node->state_lock);
+	pr_debug("receive passive fin ack from node %d with state %s\n",
+		 node->nodeid, dlm_state_str(node->state));
+
+	switch (node->state) {
+	case DLM_LAST_ACK:
+		/* DLM_CLOSED */
+		midcomms_node_reset(node);
+		break;
+	case DLM_CLOSED:
+		/* not valid but somehow we got what we want */
+		wake_up(&node->shutdown_wait);
+		break;
+	default:
+		/* snapshot the state while still holding the lock so the
+		 * log shows the value that was actually evaluated
+		 */
+		state = node->state;
+		spin_unlock(&node->state_lock);
+		log_print("%s: unexpected state: %d",
+			  __func__, state);
+		WARN_ON(1);
+		return;
+	}
+	spin_unlock(&node->state_lock);
+}
+
+/* Handle the inner message @p of a DLM_OPTS packet from @node that was
+ * sent with sequence number @seq.
+ *
+ * If @seq is the expected one, seq_next is advanced, an ack is sent and
+ * the message is either run through the FIN state machine (DLM_FIN) or
+ * handed to dlm_receive_buffer(). Otherwise the message is ignored; if
+ * @seq lies before seq_next the current seq_next is acked again.
+ */
+static void dlm_midcomms_receive_buffer(union dlm_packet *p,
+					struct midcomms_node *node,
+					uint32_t seq)
+{
+	int state;
+
+	if (seq == node->seq_next) {
+		node->seq_next++;
+		/* send ack before fin */
+		dlm_send_ack(node->nodeid, node->seq_next);
+
+		switch (p->header.h_cmd) {
+		case DLM_FIN:
+			spin_lock(&node->state_lock);
+			pr_debug("receive fin msg from node %d with state %s\n",
+				 node->nodeid, dlm_state_str(node->state));
+
+			switch (node->state) {
+			case DLM_ESTABLISHED:
+				node->state = DLM_CLOSE_WAIT;
+				pr_debug("switch node %d to state %s\n",
+					 node->nodeid, dlm_state_str(node->state));
+				/* passive shutdown DLM_LAST_ACK case 1
+				 * additional we check if the node is used by
+				 * cluster manager events at all.
+				 */
+				if (node->users == 0) {
+					node->state = DLM_LAST_ACK;
+					pr_debug("switch node %d to state %s case 1\n",
+						 node->nodeid, dlm_state_str(node->state));
+					spin_unlock(&node->state_lock);
+					goto send_fin;
+				}
+				break;
+			case DLM_FIN_WAIT1:
+				node->state = DLM_CLOSING;
+				pr_debug("switch node %d to state %s\n",
+					 node->nodeid, dlm_state_str(node->state));
+				break;
+			case DLM_FIN_WAIT2:
+				midcomms_node_reset(node);
+				pr_debug("switch node %d to state %s\n",
+					 node->nodeid, dlm_state_str(node->state));
+				wake_up(&node->shutdown_wait);
+				break;
+			case DLM_LAST_ACK:
+				/* probably remove_member caught it, do nothing */
+				break;
+			default:
+				/* snapshot the state under the lock for the log */
+				state = node->state;
+				spin_unlock(&node->state_lock);
+				log_print("%s: unexpected state: %d",
+					  __func__, state);
+				WARN_ON(1);
+				return;
+			}
+			spin_unlock(&node->state_lock);
+
+			set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+			break;
+		default:
+			WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+			dlm_receive_buffer(p, node->nodeid);
+			break;
+		}
+	} else {
+		/* retry to ack message which we already have by sending back
+		 * current node->seq_next number as ack.
+		 *
+		 * Sequence numbers are 32 bit and wrap around, so compare
+		 * them with before() like dlm_receive_ack() does.
+		 */
+		if (before(seq, node->seq_next))
+			dlm_send_ack(node->nodeid, node->seq_next);
+
+		log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d",
+				      seq, node->seq_next, node->nodeid);
+	}
+
+	return;
+
+send_fin:
+	set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+	dlm_send_fin(node, dlm_pas_fin_ack_rcv);
+}
+
+/* Bring an already known node into DLM_ESTABLISHED when one of the
+ * version detection RCOM messages arrives. A node found in any state other
+ * than CLOSED/ESTABLISHED is reset first.
+ */
+static void dlm_midcomms_rcom_begin(struct midcomms_node *node)
+{
+	spin_lock(&node->state_lock);
+	if (node->state != DLM_ESTABLISHED)
+		pr_debug("receive begin RCOM msg from node %d with state %s\n",
+			 node->nodeid, dlm_state_str(node->state));
+
+	switch (node->state) {
+	case DLM_CLOSED:
+		node->state = DLM_ESTABLISHED;
+		pr_debug("switch node %d to state %s\n",
+			 node->nodeid, dlm_state_str(node->state));
+		break;
+	case DLM_ESTABLISHED:
+		break;
+	default:
+		/* some invalid state passive shutdown
+		 * was failed, we try to reset and
+		 * hope it will go on.
+		 */
+		log_print("reset node %d because shutdown stuck",
+			  node->nodeid);
+
+		midcomms_node_reset(node);
+		node->state = DLM_ESTABLISHED;
+		break;
+	}
+	spin_unlock(&node->state_lock);
+}
+
+/* Find the midcomms node a received packet belongs to.
+ *
+ * Only the RCOM NAMES/STATUS messages and their replies are allowed to
+ * create a node that is not known yet; for every other message an unknown
+ * node is logged and NULL is returned. @cb is run on the node found and a
+ * negative result from it also yields NULL.
+ */
+static struct midcomms_node *
+dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p,
+			      uint16_t msglen, int (*cb)(struct midcomms_node *node))
+{
+	struct midcomms_node *node = NULL;
+	gfp_t allocation = 0;
+	int ret;
+
+	if (p->header.h_cmd == DLM_RCOM) {
+		if (msglen < sizeof(struct dlm_rcom)) {
+			log_print("rcom msg too small: %u, will skip this message from node %d",
+				  msglen, nodeid);
+			return NULL;
+		}
+
+		switch (le32_to_cpu(p->rcom.rc_type)) {
+		case DLM_RCOM_NAMES:
+		case DLM_RCOM_NAMES_REPLY:
+		case DLM_RCOM_STATUS:
+		case DLM_RCOM_STATUS_REPLY:
+			node = nodeid2node(nodeid, 0);
+			if (node)
+				dlm_midcomms_rcom_begin(node);
+
+			allocation = GFP_NOFS;
+			break;
+		default:
+			break;
+		}
+	}
+
+	node = nodeid2node(nodeid, allocation);
+	if (node) {
+		ret = cb(node);
+		if (ret < 0)
+			return NULL;
+
+		return node;
+	}
+
+	if (p->header.h_cmd != DLM_OPTS) {
+		log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence",
+				      p->header.h_cmd, nodeid);
+		return NULL;
+	}
+
+	if (msglen < sizeof(struct dlm_opts)) {
+		log_print("opts msg too small: %u, will skip this message from node %d",
+			  msglen, nodeid);
+		return NULL;
+	}
+
+	log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence",
+			      p->opts.o_nextcmd, nodeid);
+	return NULL;
+}
+
+/* Version callback for the 3.2 receive path: record DLM_VERSION_3_2 on a
+ * node without a version, accept a node already at 3.2 and return -1 for
+ * a node that was detected with a different version.
+ */
+static int dlm_midcomms_version_check_3_2(struct midcomms_node *node)
+{
+	if (node->version == DLM_VERSION_NOT_SET) {
+		node->version = DLM_VERSION_3_2;
+		log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2,
+			  node->nodeid);
+		return 0;
+	}
+
+	if (node->version == DLM_VERSION_3_2)
+		return 0;
+
+	log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
+			      DLM_VERSION_3_2, node->nodeid, node->version);
+	return -1;
+}
+
+/* Validate that a DLM_OPTS packet of outer length @msglen is large enough
+ * for the opts header, its options and the minimum size of the inner
+ * message type named by o_nextcmd.
+ *
+ * Returns 0 if the lengths are plausible, -1 otherwise (including an
+ * unsupported o_nextcmd).
+ */
+static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid)
+{
+	int len = msglen;
+
+	/* we only trust outer header msglen because
+	 * it's checked against receive buffer length.
+	 */
+	if (len < sizeof(struct dlm_opts))
+		return -1;
+	len -= sizeof(struct dlm_opts);
+
+	if (len < le16_to_cpu(p->opts.o_optlen))
+		return -1;
+	len -= le16_to_cpu(p->opts.o_optlen);
+
+	/* len is now the room left for the inner message */
+	switch (p->opts.o_nextcmd) {
+	case DLM_FIN:
+		if (len < sizeof(struct dlm_header)) {
+			log_print("fin too small: %d, will skip this message from node %d",
+				  len, nodeid);
+			return -1;
+		}
+
+		break;
+	case DLM_MSG:
+		if (len < sizeof(struct dlm_message)) {
+			/* report the remaining length that was compared,
+			 * like the FIN and RCOM branches do
+			 */
+			log_print("msg too small: %d, will skip this message from node %d",
+				  len, nodeid);
+			return -1;
+		}
+
+		break;
+	case DLM_RCOM:
+		if (len < sizeof(struct dlm_rcom)) {
+			log_print("rcom msg too small: %d, will skip this message from node %d",
+				  len, nodeid);
+			return -1;
+		}
+
+		break;
+	default:
+		log_print("unsupported o_nextcmd received: %u, will skip this message from node %d",
+			  p->opts.o_nextcmd, nodeid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Receive path for one packet whose header carries DLM_VERSION_3_2.
+ *
+ * Runs under nodes_srcu. The version detection RCOM messages are passed
+ * straight to dlm_receive_buffer(); DLM_OPTS packets are length checked,
+ * unwrapped and handed to dlm_midcomms_receive_buffer() together with
+ * their sequence number; DLM_ACK packets go to dlm_receive_ack().
+ * Anything else is logged and skipped.
+ */
+static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
+{
+	uint16_t msglen = le16_to_cpu(p->header.h_length);
+	struct midcomms_node *node;
+	uint32_t seq;
+	int ret, idx;
+
+	idx = srcu_read_lock(&nodes_srcu);
+	node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen,
+					     dlm_midcomms_version_check_3_2);
+	if (!node)
+		goto out;
+
+	switch (p->header.h_cmd) {
+	case DLM_RCOM:
+		/* these rcom message we use to determine version.
+		 * they have their own retransmission handling and
+		 * are the first messages of dlm.
+		 *
+		 * length already checked.
+		 */
+		switch (le32_to_cpu(p->rcom.rc_type)) {
+		case DLM_RCOM_NAMES:
+			fallthrough;
+		case DLM_RCOM_NAMES_REPLY:
+			fallthrough;
+		case DLM_RCOM_STATUS:
+			fallthrough;
+		case DLM_RCOM_STATUS_REPLY:
+			break;
+		default:
+			log_print("unsupported rcom type received: %u, will skip this message from node %d",
+				  le32_to_cpu(p->rcom.rc_type), nodeid);
+			goto out;
+		}
+
+		WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+		dlm_receive_buffer(p, nodeid);
+		break;
+	case DLM_OPTS:
+		seq = le32_to_cpu(p->header.u.h_seq);
+
+		ret = dlm_opts_check_msglen(p, msglen, nodeid);
+		if (ret < 0) {
+			log_print("opts msg too small: %u, will skip this message from node %d",
+				  msglen, nodeid);
+			goto out;
+		}
+
+		/* step over the opts header and its options to the inner message */
+		p = (union dlm_packet *)((unsigned char *)p->opts.o_opts +
+					 le16_to_cpu(p->opts.o_optlen));
+
+		/* recheck inner msglen just if it's not garbage */
+		msglen = le16_to_cpu(p->header.h_length);
+		switch (p->header.h_cmd) {
+		case DLM_RCOM:
+			if (msglen < sizeof(struct dlm_rcom)) {
+				log_print("inner rcom msg too small: %u, will skip this message from node %d",
+					  msglen, nodeid);
+				goto out;
+			}
+
+			break;
+		case DLM_MSG:
+			if (msglen < sizeof(struct dlm_message)) {
+				log_print("inner msg too small: %u, will skip this message from node %d",
+					  msglen, nodeid);
+				goto out;
+			}
+
+			break;
+		case DLM_FIN:
+			if (msglen < sizeof(struct dlm_header)) {
+				log_print("inner fin too small: %u, will skip this message from node %d",
+					  msglen, nodeid);
+				goto out;
+			}
+
+			break;
+		default:
+			/* log the offending command, not the message length */
+			log_print("unsupported inner h_cmd received: %u, will skip this message from node %d",
+				  p->header.h_cmd, nodeid);
+			goto out;
+		}
+
+		dlm_midcomms_receive_buffer(p, node, seq);
+		break;
+	case DLM_ACK:
+		seq = le32_to_cpu(p->header.u.h_seq);
+		dlm_receive_ack(node, seq);
+		break;
+	default:
+		log_print("unsupported h_cmd received: %u, will skip this message from node %d",
+			  p->header.h_cmd, nodeid);
+		break;
+	}
+
+out:
+	srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* Version callback for the 3.1 receive path: record DLM_VERSION_3_1 on a
+ * node without a version, accept a node already at 3.1 and return -1 for
+ * a node that was detected with a different version.
+ */
+static int dlm_midcomms_version_check_3_1(struct midcomms_node *node)
+{
+	if (node->version == DLM_VERSION_NOT_SET) {
+		node->version = DLM_VERSION_3_1;
+		log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1,
+			  node->nodeid);
+		return 0;
+	}
+
+	if (node->version == DLM_VERSION_3_1)
+		return 0;
+
+	log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
+			      DLM_VERSION_3_1, node->nodeid, node->version);
+	return -1;
+}
+
+/* Receive path for one packet whose header carries DLM_VERSION_3_1.
+ *
+ * The node lookup (and version check) is done under nodes_srcu; after
+ * that only DLM_RCOM and sufficiently large DLM_MSG packets are passed on
+ * to dlm_receive_buffer(), everything else is logged and skipped.
+ */
+static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid)
+{
+	uint16_t msglen = le16_to_cpu(p->header.h_length);
+	struct midcomms_node *node;
+	int idx;
+
+	idx = srcu_read_lock(&nodes_srcu);
+	node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen,
+					     dlm_midcomms_version_check_3_1);
+	srcu_read_unlock(&nodes_srcu, idx);
+	if (!node)
+		return;
+
+	switch (p->header.h_cmd) {
+	case DLM_RCOM:
+		/* length already checked */
+		break;
+	case DLM_MSG:
+		if (msglen >= sizeof(struct dlm_message))
+			break;
+
+		log_print("msg too small: %u, will skip this message from node %d",
+			  msglen, nodeid);
+		return;
+	default:
+		log_print("unsupported h_cmd received: %u, will skip this message from node %d",
+			  p->header.h_cmd, nodeid);
+		return;
+	}
+
+	dlm_receive_buffer(p, nodeid);
+}
+
/*
* Called from the low-level comms layer to process a buffer of
* commands.
@@ -43,7 +887,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
while (len >= sizeof(struct dlm_header)) {
hd = (struct dlm_header *)ptr;
- /* no message should be more than DEFAULT_BUFFER_SIZE or
+ /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or
* less than dlm_header size.
*
* Some messages does not have a 8 byte length boundary yet
@@ -55,7 +899,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
* the next major version bump.
*/
msglen = le16_to_cpu(hd->h_length);
- if (msglen > DEFAULT_BUFFER_SIZE ||
+ if (msglen > DLM_MAX_SOCKET_BUFSIZE ||
msglen < sizeof(struct dlm_header)) {
log_print("received invalid length header: %u from node %d, will abort message parsing",
msglen, nodeid);
@@ -68,32 +912,19 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
if (msglen > len)
break;
- switch (hd->h_cmd) {
- case DLM_MSG:
- if (msglen < sizeof(struct dlm_message)) {
- log_print("dlm msg too small: %u, will skip this message",
- msglen);
- goto skip;
- }
-
+ switch (le32_to_cpu(hd->h_version)) {
+ case DLM_VERSION_3_1:
+ dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
break;
- case DLM_RCOM:
- if (msglen < sizeof(struct dlm_rcom)) {
- log_print("dlm rcom msg too small: %u, will skip this message",
- msglen);
- goto skip;
- }
-
+ case DLM_VERSION_3_2:
+ dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid);
break;
default:
- log_print("unsupported h_cmd received: %u, will skip this message",
- hd->h_cmd);
- goto skip;
+ log_print("received invalid version header: %u from node %d, will skip this message",
+ le32_to_cpu(hd->h_version), nodeid);
+ break;
}
- dlm_receive_buffer((union dlm_packet *)ptr, nodeid);
-
-skip:
ret += msglen;
len -= msglen;
ptr += msglen;
@@ -102,3 +933,455 @@ skip:
return ret;
}
+/* Resend every committed message that is still on the send queue of @nodeid.
+ *
+ * Nothing happens if the node is unknown or does not run DLM_VERSION_3_2.
+ * NOTE(review): entries on send_queue are presumably the ones not yet
+ * acknowledged by the peer - confirm against dlm_receive_ack().
+ */
+void dlm_midcomms_unack_msg_resend(int nodeid)
+{
+ struct midcomms_node *node;
+ struct dlm_mhandle *mh;
+ int idx, ret;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ /* only version 3.2 supports retransmission on failure */
+ switch (node->version) {
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(mh, &node->send_queue, list) {
+ /* handles that were allocated but not committed yet are skipped */
+ if (!mh->committed)
+ continue;
+
+ /* a zero return from lowcomms is what gets logged as a retransmit */
+ ret = dlm_lowcomms_resend_msg(mh->msg);
+ if (!ret)
+ log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d",
+ mh->seq, node->nodeid);
+ }
+ rcu_read_unlock();
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* Fill in the header of a DLM_OPTS wrapper message.
+ *
+ * @opts: options structure whose o_header is written
+ * @inner_len: length of the wrapped inner message
+ * @seq: sequence number stored in the header
+ *
+ * The header is filled in host byte order and then converted by
+ * header_out().
+ */
+static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len,
+ uint32_t seq)
+{
+ opts->o_header.h_cmd = DLM_OPTS;
+ opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ opts->o_header.h_nodeid = dlm_our_nodeid();
+ /* total length covers the options part plus the wrapped message */
+ opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len;
+ opts->o_header.u.h_seq = seq;
+ header_out(&opts->o_header);
+}
+
+/* Callback passed to dlm_lowcomms_new_msg() for version 3.2 messages.
+ *
+ * Accounts the handle in the node's send_queue_cnt, appends it to the node's
+ * send_queue under send_queue_lock and assigns it the next send sequence
+ * number.
+ */
+static void midcomms_new_msg_cb(struct dlm_mhandle *mh)
+{
+ atomic_inc(&mh->node->send_queue_cnt);
+
+ spin_lock(&mh->node->send_queue_lock);
+ list_add_tail_rcu(&mh->list, &mh->node->send_queue);
+ spin_unlock(&mh->node->send_queue_lock);
+
+ /* NOTE(review): seq_send is advanced outside send_queue_lock; presumably
+ * the caller serializes this callback - confirm in lowcomms.
+ */
+ mh->seq = mh->node->seq_send++;
+}
+
+/* Allocate a lowcomms message for a version 3.2 peer.
+ *
+ * The buffer is sized for @len payload bytes plus DLM_MIDCOMMS_OPT_LEN for the
+ * options wrapper. The wrapper header is filled in at the start of the buffer
+ * and *@ppc is advanced past it, so the caller writes the inner message right
+ * behind the options.
+ *
+ * Returns the lowcomms message, or NULL if dlm_lowcomms_new_msg() failed.
+ */
+static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid,
+ int len, gfp_t allocation, char **ppc)
+{
+ struct dlm_opts *opts;
+ struct dlm_msg *msg;
+
+ msg = dlm_lowcomms_new_msg(nodeid, len + DLM_MIDCOMMS_OPT_LEN,
+ allocation, ppc, midcomms_new_msg_cb, mh);
+ if (!msg)
+ return NULL;
+
+ opts = (struct dlm_opts *)*ppc;
+ mh->opts = opts;
+
+ /* add possible options here */
+ /* mh->seq is expected to have been assigned by midcomms_new_msg_cb()
+ * during dlm_lowcomms_new_msg() - TODO confirm the callback runs there
+ */
+ dlm_fill_opts_header(opts, len, mh->seq);
+
+ /* remember where the inner header starts; read again at commit time */
+ *ppc += sizeof(*opts);
+ mh->inner_hd = (const struct dlm_header *)*ppc;
+ return msg;
+}
+
+/* Allocate a midcomms handle and its message buffer for sending to @nodeid.
+ *
+ * @nodeid: destination node, must already be known to midcomms
+ * @len: payload length requested by the caller
+ * @allocation: gfp flags forwarded to lowcomms for the message buffer
+ * @ppc: set to the position where the caller writes its message
+ *
+ * On success the handle is returned with the nodes_srcu read lock still held
+ * (its index is kept in mh->idx); the lock is dropped again by
+ * dlm_midcomms_commit_mhandle(). On failure NULL is returned and the read
+ * lock has been released.
+ */
+struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
+ gfp_t allocation, char **ppc)
+{
+ struct midcomms_node *node;
+ struct dlm_mhandle *mh;
+ struct dlm_msg *msg;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ WARN_ON_ONCE(1);
+ goto err;
+ }
+
+ /* this is a bug; however, we carry on and hope it gets resolved */
+ WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
+
+ /* note: the handle itself always uses GFP_NOFS, not @allocation */
+ mh = kzalloc(sizeof(*mh), GFP_NOFS);
+ if (!mh)
+ goto err;
+
+ mh->idx = idx;
+ mh->node = node;
+
+ switch (node->version) {
+ case DLM_VERSION_3_1:
+ /* plain message: no options wrapper and no new-message callback */
+ msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc,
+ NULL, NULL);
+ if (!msg) {
+ kfree(mh);
+ goto err;
+ }
+
+ break;
+ case DLM_VERSION_3_2:
+ msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation,
+ ppc);
+ if (!msg) {
+ kfree(mh);
+ goto err;
+ }
+
+ break;
+ default:
+ kfree(mh);
+ WARN_ON(1);
+ goto err;
+ }
+
+ mh->msg = msg;
+
+ /* the caller must call dlm_midcomms_commit_mhandle()
+ * afterwards: it releases nodes_srcu using mh->idx,
+ * which is still held when we return here.
+ */
+ return mh;
+
+err:
+ srcu_read_unlock(&nodes_srcu, idx);
+ return NULL;
+}
+
+/* Commit a version 3.2 message: store the inner command in the options
+ * header, mark the handle as committed and hand the message to lowcomms.
+ */
+static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
+{
+ /* nexthdr chain for fast lookup */
+ mh->opts->o_nextcmd = mh->inner_hd->h_cmd;
+ /* from now on dlm_midcomms_unack_msg_resend() will consider this handle */
+ mh->committed = true;
+ dlm_lowcomms_commit_msg(mh->msg);
+}
+
+/* Commit a handle obtained from dlm_midcomms_get_mhandle() and release the
+ * nodes_srcu read lock taken there (index stored in mh->idx).
+ *
+ * For a 3.1 node the message is committed, the message reference is put and
+ * the handle is freed. For a 3.2 node the handle is not freed here.
+ * NOTE(review): a 3.2 handle presumably lives on the node's send_queue until
+ * it is acknowledged - the freeing path is not visible here, confirm.
+ */
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
+{
+ switch (mh->node->version) {
+ case DLM_VERSION_3_1:
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+
+ dlm_lowcomms_commit_msg(mh->msg);
+ dlm_lowcomms_put_msg(mh->msg);
+ /* mh is not part of rcu list in this case */
+ kfree(mh);
+ break;
+ case DLM_VERSION_3_2:
+ dlm_midcomms_commit_msg_3_2(mh);
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+ break;
+ default:
+ /* unknown version: only drop the read lock and warn */
+ srcu_read_unlock(&nodes_srcu, mh->idx);
+ WARN_ON(1);
+ break;
+ }
+}
+
+/* Initialise all CONN_HASH_SIZE buckets of node_hash and start lowcomms.
+ *
+ * Returns the result of dlm_lowcomms_start().
+ */
+int dlm_midcomms_start(void)
+{
+ int i;
+
+ for (i = 0; i < CONN_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&node_hash[i]);
+
+ return dlm_lowcomms_start();
+}
+
+/* Ack callback for the fin sent by the active shutdown path (passed to
+ * dlm_send_fin() in midcomms_shutdown()).
+ *
+ * Advances the node state under state_lock: DLM_FIN_WAIT1 moves to
+ * DLM_FIN_WAIT2, DLM_CLOSING resets the node and wakes shutdown waiters,
+ * DLM_CLOSED only wakes the waiters. Any other state is logged and warned
+ * about.
+ */
+static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
+{
+ spin_lock(&node->state_lock);
+ pr_debug("receive active fin ack from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+
+ switch (node->state) {
+ case DLM_FIN_WAIT1:
+ node->state = DLM_FIN_WAIT2;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_CLOSING:
+ midcomms_node_reset(node);
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ wake_up(&node->shutdown_wait);
+ break;
+ case DLM_CLOSED:
+ /* not valid but somehow we got what we want */
+ wake_up(&node->shutdown_wait);
+ break;
+ default:
+ spin_unlock(&node->state_lock);
+ /* no trailing newline: no other log_print() call passes one */
+ log_print("%s: unexpected state: %d",
+ __func__, node->state);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&node->state_lock);
+}
+
+/* Register one more user of the midcomms node for @nodeid.
+ *
+ * Our own node id is ignored. The node is looked up with GFP_NOFS passed to
+ * nodeid2node() (presumably allowing it to be created - confirm). When the
+ * node had no users so far its state is brought to DLM_ESTABLISHED, resetting
+ * it first if it is in any state other than DLM_ESTABLISHED or DLM_CLOSED.
+ */
+void dlm_midcomms_add_member(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx;
+
+ if (nodeid == dlm_our_nodeid())
+ return;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, GFP_NOFS);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ if (!node->users) {
+ pr_debug("receive add member from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ break;
+ case DLM_CLOSED:
+ node->state = DLM_ESTABLISHED;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ default:
+ /* the node is in some other state, i.e. a
+ * passive shutdown did not complete; reset
+ * it and hope things carry on from here.
+ */
+ log_print("reset node %d because shutdown stuck",
+ node->nodeid);
+
+ midcomms_node_reset(node);
+ node->state = DLM_ESTABLISHED;
+ break;
+ }
+ }
+
+ node->users++;
+ pr_debug("users inc count %d\n", node->users);
+ spin_unlock(&node->state_lock);
+
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* Drop one user of the midcomms node for @nodeid.
+ *
+ * Our own node id and unknown nodes are ignored. When the user count reaches
+ * zero and the node is in DLM_CLOSE_WAIT, the node moves to DLM_LAST_ACK,
+ * DLM_NODE_FLAG_STOP_RX is set and a fin is sent with dlm_pas_fin_ack_rcv as
+ * ack callback. In the other handled states nothing further is done; an
+ * unexpected state is only logged.
+ */
+void dlm_midcomms_remove_member(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx;
+
+ if (nodeid == dlm_our_nodeid())
+ return;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ node->users--;
+ pr_debug("users dec count %d\n", node->users);
+
+ /* the users count hitting zero means the
+ * other side is running dlm_midcomms_stop();
+ * we meet it here to get a clean disconnect.
+ */
+ if (node->users == 0) {
+ pr_debug("receive remove member from node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ break;
+ case DLM_CLOSE_WAIT:
+ /* passive shutdown DLM_LAST_ACK case 2 */
+ node->state = DLM_LAST_ACK;
+ spin_unlock(&node->state_lock);
+
+ pr_debug("switch node %d to state %s case 2\n",
+ node->nodeid, dlm_state_str(node->state));
+ goto send_fin;
+ case DLM_LAST_ACK:
+ /* probably receive fin caught it, do nothing */
+ break;
+ case DLM_CLOSED:
+ /* already gone, do nothing */
+ break;
+ default:
+ /* no trailing newline: no other log_print() call passes one */
+ log_print("%s: unexpected state: %d",
+ __func__, node->state);
+ break;
+ }
+ }
+ spin_unlock(&node->state_lock);
+
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+
+send_fin:
+ /* state_lock was already dropped before jumping here */
+ set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
+ dlm_send_fin(node, dlm_pas_fin_ack_rcv);
+ srcu_read_unlock(&nodes_srcu, idx);
+}
+
+/* SRCU callback (queued via call_srcu() in dlm_midcomms_shutdown()) that
+ * frees a node; warns if its send_queue_cnt is still non-zero.
+ */
+static void midcomms_node_release(struct rcu_head *rcu)
+{
+ struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
+
+ WARN_ON(atomic_read(&node->send_queue_cnt));
+ kfree(node);
+}
+
+/* Active shutdown of one node.
+ *
+ * Only done for DLM_VERSION_3_2 nodes. A node in DLM_ESTABLISHED moves to
+ * DLM_FIN_WAIT1 and a fin is sent with dlm_act_fin_ack_rcv as ack callback;
+ * a node already in DLM_CLOSED needs nothing. Afterwards the function waits
+ * up to DLM_SHUTDOWN_TIMEOUT for the node to reach DLM_CLOSED or for
+ * DLM_NODE_FLAG_CLOSE to be set; on timeout or close the node is reset.
+ */
+static void midcomms_shutdown(struct midcomms_node *node)
+{
+ int ret;
+
+ /* old protocol, we don't wait for pending operations */
+ switch (node->version) {
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ return;
+ }
+
+ spin_lock(&node->state_lock);
+ pr_debug("receive active shutdown for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ switch (node->state) {
+ case DLM_ESTABLISHED:
+ node->state = DLM_FIN_WAIT1;
+ pr_debug("switch node %d to state %s case 2\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ case DLM_CLOSED:
+ /* we have what we want */
+ spin_unlock(&node->state_lock);
+ return;
+ default:
+ /* busy to enter DLM_FIN_WAIT1, wait until passive
+ * done in shutdown_wait to enter DLM_CLOSED.
+ */
+ break;
+ }
+ spin_unlock(&node->state_lock);
+
+ /* NOTE(review): node->state is read here without state_lock held */
+ if (node->state == DLM_FIN_WAIT1) {
+ dlm_send_fin(node, dlm_act_fin_ack_rcv);
+
+ if (DLM_DEBUG_FENCE_TERMINATION)
+ msleep(5000);
+ }
+
+ /* wait for other side dlm + fin */
+ ret = wait_event_timeout(node->shutdown_wait,
+ node->state == DLM_CLOSED ||
+ test_bit(DLM_NODE_FLAG_CLOSE, &node->flags),
+ DLM_SHUTDOWN_TIMEOUT);
+ /* ret == 0 means the timeout elapsed with the condition still false */
+ if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) {
+ pr_debug("active shutdown timed out for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ /* NOTE(review): other callers in this file invoke
+ * midcomms_node_reset() under state_lock, this one does not -
+ * confirm that is intended.
+ */
+ midcomms_node_reset(node);
+ return;
+ }
+
+ pr_debug("active shutdown done for node %d with state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+}
+
+/* Shut down and remove every known midcomms node, then shut down lowcomms.
+ *
+ * With close_lock held, each node in node_hash goes through
+ * midcomms_shutdown(), has its debugfs entry deleted, is unlinked from the
+ * hash under nodes_lock and is queued for freeing via call_srcu().
+ */
+void dlm_midcomms_shutdown(void)
+{
+ struct midcomms_node *node;
+ int i, idx;
+
+ mutex_lock(&close_lock);
+ idx = srcu_read_lock(&nodes_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ midcomms_shutdown(node);
+
+ dlm_delete_debug_comms_file(node->debugfs);
+
+ spin_lock(&nodes_lock);
+ hlist_del_rcu(&node->hlist);
+ spin_unlock(&nodes_lock);
+
+ /* the node is freed by midcomms_node_release() */
+ call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release);
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
+
+ dlm_lowcomms_shutdown();
+}
+
+/* Close the connection to @nodeid and reset its midcomms state.
+ *
+ * Returns 0 for our own node id, otherwise the result of
+ * dlm_lowcomms_close(). A waiter in midcomms_shutdown() is released first by
+ * setting DLM_NODE_FLAG_CLOSE and waking shutdown_wait; after
+ * synchronize_srcu() the node is looked up again under close_lock and reset
+ * under its state_lock.
+ */
+int dlm_midcomms_close(int nodeid)
+{
+ struct midcomms_node *node;
+ int idx, ret;
+
+ if (nodeid == dlm_our_nodeid())
+ return 0;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ /* Abort pending close/remove operation */
+ node = nodeid2node(nodeid, 0);
+ if (node) {
+ /* let shutdown waiters leave */
+ set_bit(DLM_NODE_FLAG_CLOSE, &node->flags);
+ wake_up(&node->shutdown_wait);
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
+ /* wait for readers that were inside nodes_srcu sections */
+ synchronize_srcu(&nodes_srcu);
+
+ idx = srcu_read_lock(&nodes_srcu);
+ mutex_lock(&close_lock);
+ node = nodeid2node(nodeid, 0);
+ if (!node) {
+ /* no midcomms state to reset, only close the lowcomms side */
+ mutex_unlock(&close_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+ return dlm_lowcomms_close(nodeid);
+ }
+
+ ret = dlm_lowcomms_close(nodeid);
+ spin_lock(&node->state_lock);
+ midcomms_node_reset(node);
+ spin_unlock(&node->state_lock);
+ srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
+
+ return ret;
+}
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 61e90a921849..579abc6929be 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -12,7 +12,22 @@
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__
+struct midcomms_node;
+
int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
+struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
+ gfp_t allocation, char **ppc);
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh);
+int dlm_midcomms_close(int nodeid);
+int dlm_midcomms_start(void);
+void dlm_midcomms_shutdown(void);
+void dlm_midcomms_add_member(int nodeid);
+void dlm_midcomms_remove_member(int nodeid);
+void dlm_midcomms_unack_msg_resend(int nodeid);
+const char *dlm_midcomms_state(struct midcomms_node *node);
+unsigned long dlm_midcomms_flags(struct midcomms_node *node);
+int dlm_midcomms_send_queue_cnt(struct midcomms_node *node);
+uint32_t dlm_midcomms_version(struct midcomms_node *node);
#endif /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f5b1bd65728d..5651933f54a4 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -27,25 +27,15 @@ static int rcom_response(struct dlm_ls *ls)
return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
}
-static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
- struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+ struct dlm_rcom **rc_ret, char *mb, int mb_len)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
- char *mb;
- int mb_len = sizeof(struct dlm_rcom) + len;
-
- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
- if (!mh) {
- log_print("create_rcom to %d type %d len %d ENOBUFS",
- to_nodeid, type, len);
- return -ENOBUFS;
- }
rc = (struct dlm_rcom *) mb;
rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- rc->rc_header.h_lockspace = ls->ls_global_id;
+ rc->rc_header.u.h_lockspace = ls->ls_global_id;
rc->rc_header.h_nodeid = dlm_our_nodeid();
rc->rc_header.h_length = mb_len;
rc->rc_header.h_cmd = DLM_RCOM;
@@ -56,16 +46,67 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
rc->rc_seq = ls->ls_recover_seq;
spin_unlock(&ls->ls_recover_lock);
- *mh_ret = mh;
*rc_ret = rc;
+}
+
+/* Allocate a midcomms handle for an rcom message with @len payload bytes and
+ * fill in its header via _create_rcom().
+ *
+ * On success 0 is returned, *@rc_ret is set by _create_rcom() and *@mh_ret
+ * receives the handle to pass to send_rcom(). Returns -ENOBUFS if no handle
+ * could be obtained.
+ */
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+ struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+ /* buffer holds the rcom header followed by the payload */
+ int mb_len = sizeof(struct dlm_rcom) + len;
+ struct dlm_mhandle *mh;
+ char *mb;
+
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
+ if (!mh) {
+ log_print("%s to %d type %d len %d ENOBUFS",
+ __func__, to_nodeid, type, len);
+ return -ENOBUFS;
+ }
+
+ _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len);
+ *mh_ret = mh;
+ return 0;
+}
+
+/* Like create_rcom(), but the buffer is taken directly from lowcomms with no
+ * new-message callback, so no midcomms handle is involved.
+ *
+ * On success 0 is returned, *@rc_ret is set by _create_rcom() and *@msg_ret
+ * receives the lowcomms message to pass to send_rcom_stateless(). Returns
+ * -ENOBUFS if no message could be allocated.
+ */
+static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
+ int len, struct dlm_rcom **rc_ret,
+ struct dlm_msg **msg_ret)
+{
+ /* buffer holds the rcom header followed by the payload */
+ int mb_len = sizeof(struct dlm_rcom) + len;
+ struct dlm_msg *msg;
+ char *mb;
+
+ msg = dlm_lowcomms_new_msg(to_nodeid, mb_len, GFP_NOFS, &mb,
+ NULL, NULL);
+ if (!msg) {
+ /* name this function, as create_rcom() does, so the log
+ * tells the two allocation paths apart
+ */
+ log_print("%s to %d type %d len %d ENOBUFS",
+ __func__, to_nodeid, type, len);
+ return -ENOBUFS;
+ }
+
+ _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len);
+ *msg_ret = msg;
return 0;
}
+/* Common part of sending an rcom: convert @rc with dlm_rcom_out().
+ * @ls is not used here.
+ */
+static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ dlm_rcom_out(rc);
+}
+
+/* Send an rcom created by create_rcom(): convert it and commit the midcomms
+ * handle @mh.
+ */
static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
struct dlm_rcom *rc)
{
- dlm_rcom_out(rc);
- dlm_lowcomms_commit_buffer(mh);
+ _send_rcom(ls, rc);
+ dlm_midcomms_commit_mhandle(mh);
+}
+
+/* Send an rcom created by create_rcom_stateless(): convert it, commit the
+ * lowcomms message and drop the message reference.
+ */
+static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg,
+ struct dlm_rcom *rc)
+{
+ _send_rcom(ls, rc);
+ dlm_lowcomms_commit_msg(msg);
+ dlm_lowcomms_put_msg(msg);
}
static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
@@ -141,7 +182,7 @@ static void disallow_sync_reply(struct dlm_ls *ls)
int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
+ struct dlm_msg *msg;
int error = 0;
ls->ls_recover_nodeid = nodeid;
@@ -153,17 +194,17 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
}
retry:
- error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
- sizeof(struct rcom_status), &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS,
+ sizeof(struct rcom_status), &rc, &msg);
if (error)
goto out;
set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
allow_sync_reply(ls, &rc->rc_id);
- memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN);
+ memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
error = dlm_wait_function(ls, &rcom_response);
disallow_sync_reply(ls);
@@ -191,11 +232,11 @@ retry:
static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
struct rcom_status *rs;
uint32_t status;
int nodeid = rc_in->rc_header.h_nodeid;
int len = sizeof(struct rcom_config);
+ struct dlm_msg *msg;
int num_slots = 0;
int error;
@@ -218,8 +259,8 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
len += num_slots * sizeof(struct rcom_slot);
do_create:
- error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
- len, &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY,
+ len, &rc, &msg);
if (error)
return;
@@ -246,7 +287,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
spin_unlock(&ls->ls_recover_lock);
do_send:
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
}
static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
@@ -271,21 +312,22 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
+ struct dlm_msg *msg;
int error = 0;
ls->ls_recover_nodeid = nodeid;
retry:
- error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len,
+ &rc, &msg);
if (error)
goto out;
memcpy(rc->rc_buf, last_name, last_len);
allow_sync_reply(ls, &rc->rc_id);
- memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN);
+ memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
error = dlm_wait_function(ls, &rcom_response);
disallow_sync_reply(ls);
@@ -298,14 +340,15 @@ retry:
static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
- struct dlm_mhandle *mh;
int error, inlen, outlen, nodeid;
+ struct dlm_msg *msg;
nodeid = rc_in->rc_header.h_nodeid;
inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
- outlen = LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom);
+ outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom);
- error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
+ error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen,
+ &rc, &msg);
if (error)
return;
rc->rc_id = rc_in->rc_id;
@@ -313,7 +356,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
nodeid);
- send_rcom(ls, mh, rc);
+ send_rcom_stateless(ls, msg, rc);
}
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
@@ -342,10 +385,6 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
- error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
- if (error)
- return;
-
/* Old code would send this special id to trigger a debug dump. */
if (rc_in->rc_id == 0xFFFFFFFF) {
log_error(ls, "receive_rcom_lookup dump from %d", nodeid);
@@ -353,6 +392,10 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
return;
}
+ error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
+ if (error)
+ return;
+
error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len,
DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL);
if (error)
@@ -458,14 +501,14 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
char *mb;
int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
- mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb);
+ mh = dlm_midcomms_get_mhandle(nodeid, mb_len, GFP_NOFS, &mb);
if (!mh)
return -ENOBUFS;
rc = (struct dlm_rcom *) mb;
rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
- rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+ rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace;
rc->rc_header.h_nodeid = dlm_our_nodeid();
rc->rc_header.h_length = mb_len;
rc->rc_header.h_cmd = DLM_RCOM;
@@ -479,7 +522,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
rf->rf_lvblen = cpu_to_le32(~0U);
dlm_rcom_out(rc);
- dlm_lowcomms_commit_buffer(mh);
+ dlm_midcomms_commit_mhandle(mh);
return 0;
}
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index cfd0d00b19ae..58acbcc2081a 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -20,18 +20,20 @@
#define DLM_ERRNO_ETIMEDOUT 110
#define DLM_ERRNO_EINPROGRESS 115
-static void header_out(struct dlm_header *hd)
+/* Convert the fields of @hd in place from cpu to little-endian byte order. */
+void header_out(struct dlm_header *hd)
{
hd->h_version = cpu_to_le32(hd->h_version);
- hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
+ /* this also converts the other u32 members of the union */
+ hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace);
hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
hd->h_length = cpu_to_le16(hd->h_length);
}
-static void header_in(struct dlm_header *hd)
+/* Convert the fields of @hd in place from little-endian to cpu byte order. */
+void header_in(struct dlm_header *hd)
{
hd->h_version = le32_to_cpu(hd->h_version);
- hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
+ /* this also converts the other u32 members of the union */
+ hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace);
hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
hd->h_length = le16_to_cpu(hd->h_length);
}
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
index cc719ca9397e..d46f23c7a6a0 100644
--- a/fs/dlm/util.h
+++ b/fs/dlm/util.h
@@ -15,6 +15,8 @@ void dlm_message_out(struct dlm_message *ms);
void dlm_message_in(struct dlm_message *ms);
void dlm_rcom_out(struct dlm_rcom *rc);
void dlm_rcom_in(struct dlm_rcom *rc);
+void header_out(struct dlm_header *hd);
+void header_in(struct dlm_header *hd);
#endif
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 392e721b50a3..7d85e64ea62f 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -533,7 +533,20 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
return block;
}
+#include <linux/buffer_head.h>
+
const struct address_space_operations ecryptfs_aops = {
+ /*
+ * XXX: This is pretty broken for multiple reasons: ecryptfs does not
+ * actually use buffer_heads, and ecryptfs will crash without
+ * CONFIG_BLOCK. But it matches the behavior before the default for
+ * address_space_operations without the ->set_page_dirty method was
+ * cleaned up, so this is the best we can do without maintainer
+ * feedback.
+ */
+#ifdef CONFIG_BLOCK
+ .set_page_dirty = __set_page_dirty_buffers,
+#endif
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
.write_begin = ecryptfs_write_begin,
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 858b3339f381..906af0c1998c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -75,4 +75,3 @@ config EROFS_FS_ZIP
Enable fixed-sized output compression for EROFS.
If you don't want to enable compression feature, say N.
-
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index aea129ddda74..3701c72bacb2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2019 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_FS_COMPRESS_H
#define __EROFS_FS_COMPRESS_H
@@ -85,4 +84,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq,
struct list_head *pagepool);
#endif
-
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index ebac756cb2a3..3787a5fb0a42 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "internal.h"
#include <linux/prefetch.h>
@@ -315,4 +314,3 @@ const struct address_space_operations erofs_raw_access_aops = {
.readahead = erofs_raw_access_readahead,
.bmap = erofs_bmap,
};
-
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 88e33addf229..a5bc4b1b7813 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2019 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "compress.h"
#include <linux/module.h>
@@ -407,4 +406,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq,
return z_erofs_shifted_transform(rq, pagepool);
return z_erofs_decompress_generic(rq, pagepool);
}
-
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 2776bb832127..eee9b0b31b63 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "internal.h"
@@ -139,4 +138,3 @@ const struct file_operations erofs_dir_fops = {
.read = generic_read_dir,
.iterate_shared = erofs_readdir,
};
-
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 8739d3adf51f..0f8da74570b4 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -4,7 +4,6 @@
*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_FS_H
#define __EROFS_FS_H
@@ -348,4 +347,3 @@ static inline void erofs_check_ondisk_layout_definitions(void)
}
#endif
-
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 7ed2d7391692..aa8a0d770ba3 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "xattr.h"
@@ -374,4 +373,3 @@ const struct inode_operations erofs_fast_symlink_iops = {
.listxattr = erofs_listxattr,
.get_acl = erofs_get_acl,
};
-
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index f92e3e32b9f4..543c2ff97d30 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_INTERNAL_H
#define __EROFS_INTERNAL_H
@@ -469,4 +468,3 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
-
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 3a81e1f7fc06..a8271ce5e13f 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "xattr.h"
@@ -247,4 +246,3 @@ const struct inode_operations erofs_dir_iops = {
.listxattr = erofs_listxattr,
.get_acl = erofs_get_acl,
};
-
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index bbf3bbd908e0..8fc6c04b54f4 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include <linux/module.h>
#include <linux/buffer_head.h>
@@ -285,6 +284,7 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
}
+ ret = -EINVAL;
blkszbits = dsb->blkszbits;
/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
if (blkszbits != LOG_BLOCK_SIZE) {
@@ -751,4 +751,3 @@ module_exit(erofs_module_exit);
MODULE_DESCRIPTION("Enhanced ROM File System");
MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc.");
MODULE_LICENSE("GPL");
-
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h
index a72897c86744..64ceb7270b5c 100644
--- a/fs/erofs/tagptr.h
+++ b/fs/erofs/tagptr.h
@@ -1,8 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* A tagged pointer implementation
- *
- * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_FS_TAGPTR_H
#define __EROFS_FS_TAGPTR_H
@@ -107,4 +105,3 @@ tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
*ptptr; })
#endif /* __EROFS_FS_TAGPTR_H */
-
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 6758c5b19f7c..bd86067a63f7 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "internal.h"
#include <linux/pagevec.h>
@@ -278,4 +277,3 @@ void erofs_exit_shrinker(void)
unregister_shrinker(&erofs_shrinker_info);
}
#endif /* !CONFIG_EROFS_FS_ZIP */
-
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 47314a26767a..8dd54b420a1d 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include <linux/security.h>
#include "xattr.h"
@@ -709,4 +708,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type)
return acl;
}
#endif
-
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 815304bd335f..366dcb400525 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_XATTR_H
#define __EROFS_XATTR_H
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 78e4b598ecca..cb4d0889eca9 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "zdata.h"
#include "compress.h"
@@ -380,7 +379,6 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt,
enum z_erofs_page_type type)
{
int ret;
- bool occupied;
/* give priority for inplaceio */
if (clt->mode >= COLLECT_PRIMARY &&
@@ -388,8 +386,7 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt,
z_erofs_try_inplace_io(clt, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector,
- page, type, &occupied);
+ ret = z_erofs_pagevec_enqueue(&clt->vector, page, type);
clt->cl->vcnt += (unsigned int)ret;
return ret ? 0 : -EAGAIN;
@@ -1471,4 +1468,3 @@ const struct address_space_operations z_erofs_aops = {
.readpage = z_erofs_readpage,
.readahead = z_erofs_readahead,
};
-
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 942ee69dff6a..3a008f1b9f78 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_FS_ZDATA_H
#define __EROFS_FS_ZDATA_H
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index efaf32596b97..f68aea4baed7 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2018-2019 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#include "internal.h"
#include <asm/unaligned.h>
@@ -597,4 +596,3 @@ out:
DBG_BUGON(err < 0 && err != -ENOMEM);
return err;
}
-
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
index 1d67cbd38704..dfd7fe0503bb 100644
--- a/fs/erofs/zpvec.h
+++ b/fs/erofs/zpvec.h
@@ -2,7 +2,6 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_FS_ZPVEC_H
#define __EROFS_FS_ZPVEC_H
@@ -107,10 +106,8 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
struct page *page,
- enum z_erofs_page_type type,
- bool *occupied)
+ enum z_erofs_page_type type)
{
- *occupied = false;
if (!ctor->next && type)
if (ctor->index + 1 == ctor->nr)
return false;
@@ -125,7 +122,6 @@ static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
/* should remind that collector->next never equal to 1, 2 */
if (type == (uintptr_t)ctor->next) {
ctor->next = page;
- *occupied = true;
}
ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
return true;
@@ -154,4 +150,3 @@ z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
return tagptr_unfold_ptr(t);
}
#endif
-
diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..38f63451b928 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -84,9 +84,6 @@ static DEFINE_RWLOCK(binfmt_lock);
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
- BUG_ON(!fmt);
- if (WARN_ON(!fmt->load_binary))
- return;
write_lock(&binfmt_lock);
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
@@ -1360,6 +1357,10 @@ int begin_new_exec(struct linux_binprm * bprm)
WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
flush_signal_handlers(me, 0);
+ retval = set_cred_ucounts(bprm->cred);
+ if (retval < 0)
+ goto out_unlock;
+
/*
* install the new credentials for this executable
*/
@@ -1874,7 +1875,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* whether NPROC limit is still exceeded.
*/
if ((current->flags & PF_NPROC_EXCEEDED) &&
- atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+ is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN;
goto out_ret;
}
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 1803ef3220fd..ca37d4344361 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -491,6 +491,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations exfat_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = exfat_readpage,
.readahead = exfat_readahead,
.writepage = exfat_writepage,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 68178b2234bd..dadb121beb22 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -961,6 +961,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
}
const struct address_space_operations ext2_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_writepage,
@@ -975,6 +976,7 @@ const struct address_space_operations ext2_aops = {
};
const struct address_space_operations ext2_nobh_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_nobh_writepage,
@@ -990,7 +992,7 @@ const struct address_space_operations ext2_nobh_aops = {
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.invalidatepage = noop_invalidatepage,
};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 37002663d521..3c51e243450d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -720,6 +720,7 @@ enum {
#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
+#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32)
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
@@ -741,6 +742,14 @@ enum {
#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
+/* flags for ioctl EXT4_IOC_CHECKPOINT */
+#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1
+#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2
+#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4
+#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \
+ EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \
+ EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
@@ -1477,7 +1486,7 @@ struct ext4_sb_info {
unsigned int s_inode_goal;
u32 s_hash_seed[4];
int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
+ int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
struct percpu_counter s_freeclusters_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
@@ -1488,6 +1497,7 @@ struct ext4_sb_info {
struct kobject s_kobj;
struct completion s_kobj_unregister;
struct super_block *s_sb;
+ struct buffer_head *s_mmp_bh;
/* Journaling */
struct journal_s *s_journal;
@@ -3614,6 +3624,7 @@ extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
/* sysfs.c */
+extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi);
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
@@ -3720,6 +3731,9 @@ extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* mmp.c */
+extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);
+
/* verity.c */
extern const struct fsverity_operations ext4_verityops;
@@ -3784,7 +3798,7 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
* have to read the block because we may read the old data
* successfully.
*/
- if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+ if (buffer_write_io_error(bh))
set_buffer_uptodate(bh);
return buffer_uptodate(bh);
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 77c84d6f1af6..92ad64b89d9b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -825,6 +825,7 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
+ eh->eh_generation = 0;
ext4_mark_inode_dirty(handle, inode);
}
@@ -1090,6 +1091,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
+ neh->eh_generation = 0;
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -1167,6 +1169,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
+ neh->eh_generation = 0;
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
ext4_idx_store_pblock(fidx, oldblock);
@@ -1306,6 +1309,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
neh->eh_magic = EXT4_EXT_MAGIC;
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -3206,7 +3210,10 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_mark_unwritten(ex2);
err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
- if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (err != -ENOSPC && err != -EDQUOT)
+ goto out;
+
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
if (split_flag & EXT4_EXT_DATA_VALID1) {
err = ext4_ext_zeroout(inode, ex2);
@@ -3232,25 +3239,22 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_pblock(&orig_ex));
}
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + path->p_depth);
- if (err)
- goto fix_extent_len;
-
- /* update extent status tree */
- err = ext4_zeroout_es(inode, &zero_ex);
-
- goto out;
- } else if (err)
- goto fix_extent_len;
-
-out:
- ext4_ext_show_leaf(inode, path);
- return err;
+ if (!err) {
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (!err)
+ /* update extent status tree */
+ err = ext4_zeroout_es(inode, &zero_ex);
+ /* If we failed at this point, we don't know in which
+ * state the extent tree exactly is so don't try to fix
+ * length of the original extent as it may do even more
+ * damage.
+ */
+ goto out;
+ }
+ }
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
@@ -3260,6 +3264,9 @@ fix_extent_len:
*/
ext4_ext_dirty(handle, inode, path + path->p_depth);
return err;
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
}
/*
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 0a729027322d..9a3a8996aacf 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1574,11 +1574,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
- if (!nr_to_scan)
- return ret;
-
nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL);
+ ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
return nr_shrunk;
}
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index f98ca4f37ef6..e8195229c252 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1288,28 +1288,29 @@ struct dentry_info_args {
};
static inline void tl_to_darg(struct dentry_info_args *darg,
- struct ext4_fc_tl *tl)
+ struct ext4_fc_tl *tl, u8 *val)
{
- struct ext4_fc_dentry_info *fcd;
+ struct ext4_fc_dentry_info fcd;
- fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
+ memcpy(&fcd, val, sizeof(fcd));
- darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
- darg->ino = le32_to_cpu(fcd->fc_ino);
- darg->dname = fcd->fc_dname;
- darg->dname_len = ext4_fc_tag_len(tl) -
- sizeof(struct ext4_fc_dentry_info);
+ darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
+ darg->ino = le32_to_cpu(fcd.fc_ino);
+ darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
+ darg->dname_len = le16_to_cpu(tl->fc_len) -
+ sizeof(struct ext4_fc_dentry_info);
}
/* Unlink replay function */
-static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
+static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
{
struct inode *inode, *old_parent;
struct qstr entry;
struct dentry_info_args darg;
int ret = 0;
- tl_to_darg(&darg, tl);
+ tl_to_darg(&darg, tl, val);
trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
darg.parent_ino, darg.dname_len);
@@ -1399,13 +1400,14 @@ out:
}
/* Link replay function */
-static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
+static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
{
struct inode *inode;
struct dentry_info_args darg;
int ret = 0;
- tl_to_darg(&darg, tl);
+ tl_to_darg(&darg, tl, val);
trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
darg.parent_ino, darg.dname_len);
@@ -1450,9 +1452,10 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
/*
* Inode replay function
*/
-static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
+static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
{
- struct ext4_fc_inode *fc_inode;
+ struct ext4_fc_inode fc_inode;
struct ext4_inode *raw_inode;
struct ext4_inode *raw_fc_inode;
struct inode *inode = NULL;
@@ -1460,9 +1463,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
struct ext4_extent_header *eh;
- fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
+ memcpy(&fc_inode, val, sizeof(fc_inode));
- ino = le32_to_cpu(fc_inode->fc_ino);
+ ino = le32_to_cpu(fc_inode.fc_ino);
trace_ext4_fc_replay(sb, tag, ino, 0, 0);
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
@@ -1474,12 +1477,13 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
ext4_fc_record_modified_inode(sb, ino);
- raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
+ raw_fc_inode = (struct ext4_inode *)
+ (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
if (ret)
goto out;
- inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
+ inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
raw_inode = ext4_raw_inode(&iloc);
memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
@@ -1547,14 +1551,15 @@ out:
* inode for which we are trying to create a dentry here, should already have
* been replayed before we start here.
*/
-static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
+static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
{
int ret = 0;
struct inode *inode = NULL;
struct inode *dir = NULL;
struct dentry_info_args darg;
- tl_to_darg(&darg, tl);
+ tl_to_darg(&darg, tl, val);
trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
darg.parent_ino, darg.dname_len);
@@ -1633,9 +1638,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino,
/* Replay add range tag */
static int ext4_fc_replay_add_range(struct super_block *sb,
- struct ext4_fc_tl *tl)
+ struct ext4_fc_tl *tl, u8 *val)
{
- struct ext4_fc_add_range *fc_add_ex;
+ struct ext4_fc_add_range fc_add_ex;
struct ext4_extent newex, *ex;
struct inode *inode;
ext4_lblk_t start, cur;
@@ -1645,15 +1650,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
struct ext4_ext_path *path = NULL;
int ret;
- fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
- ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
+ memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
+ ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
- le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
+ le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
ext4_ext_get_actual_len(ex));
- inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
- EXT4_IGET_NORMAL);
+ inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
jbd_debug(1, "Inode not found.");
return 0;
@@ -1762,32 +1766,33 @@ next:
/* Replay DEL_RANGE tag */
static int
-ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
+ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
{
struct inode *inode;
- struct ext4_fc_del_range *lrange;
+ struct ext4_fc_del_range lrange;
struct ext4_map_blocks map;
ext4_lblk_t cur, remaining;
int ret;
- lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
- cur = le32_to_cpu(lrange->fc_lblk);
- remaining = le32_to_cpu(lrange->fc_len);
+ memcpy(&lrange, val, sizeof(lrange));
+ cur = le32_to_cpu(lrange.fc_lblk);
+ remaining = le32_to_cpu(lrange.fc_len);
trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
- le32_to_cpu(lrange->fc_ino), cur, remaining);
+ le32_to_cpu(lrange.fc_ino), cur, remaining);
- inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
+ inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
- jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
+ jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
return 0;
}
ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
- inode->i_ino, le32_to_cpu(lrange->fc_lblk),
- le32_to_cpu(lrange->fc_len));
+ inode->i_ino, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_len));
while (remaining > 0) {
map.m_lblk = cur;
map.m_len = remaining;
@@ -1808,8 +1813,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
}
ret = ext4_punch_hole(inode,
- le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
- le32_to_cpu(lrange->fc_len) << sb->s_blocksize_bits);
+ le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
+ le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
if (ret)
jbd_debug(1, "ext4_punch_hole returned %d", ret);
ext4_ext_replay_shrink_inode(inode,
@@ -1925,11 +1930,11 @@ static int ext4_fc_replay_scan(journal_t *journal,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_fc_replay_state *state;
int ret = JBD2_FC_REPLAY_CONTINUE;
- struct ext4_fc_add_range *ext;
- struct ext4_fc_tl *tl;
- struct ext4_fc_tail *tail;
- __u8 *start, *end;
- struct ext4_fc_head *head;
+ struct ext4_fc_add_range ext;
+ struct ext4_fc_tl tl;
+ struct ext4_fc_tail tail;
+ __u8 *start, *end, *cur, *val;
+ struct ext4_fc_head head;
struct ext4_extent *ex;
state = &sbi->s_fc_replay_state;
@@ -1956,15 +1961,17 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_replay_expected_off++;
- fc_for_each_tl(start, end, tl) {
+ for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
+ memcpy(&tl, cur, sizeof(tl));
+ val = cur + sizeof(tl);
jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
- tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
- switch (le16_to_cpu(tl->fc_tag)) {
+ tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
+ switch (le16_to_cpu(tl.fc_tag)) {
case EXT4_FC_TAG_ADD_RANGE:
- ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
- ex = (struct ext4_extent *)&ext->fc_ex;
+ memcpy(&ext, val, sizeof(ext));
+ ex = (struct ext4_extent *)&ext.fc_ex;
ret = ext4_fc_record_regions(sb,
- le32_to_cpu(ext->fc_ino),
+ le32_to_cpu(ext.fc_ino),
le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
ext4_ext_get_actual_len(ex));
if (ret < 0)
@@ -1978,18 +1985,18 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_INODE:
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
- sizeof(*tl) + ext4_fc_tag_len(tl));
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ sizeof(tl) + le16_to_cpu(tl.fc_len));
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
- tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
- sizeof(*tl) +
+ memcpy(&tail, val, sizeof(tail));
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ sizeof(tl) +
offsetof(struct ext4_fc_tail,
fc_crc));
- if (le32_to_cpu(tail->fc_tid) == expected_tid &&
- le32_to_cpu(tail->fc_crc) == state->fc_crc) {
+ if (le32_to_cpu(tail.fc_tid) == expected_tid &&
+ le32_to_cpu(tail.fc_crc) == state->fc_crc) {
state->fc_replay_num_tags = state->fc_cur_tag;
state->fc_regions_valid =
state->fc_regions_used;
@@ -2000,19 +2007,19 @@ static int ext4_fc_replay_scan(journal_t *journal,
state->fc_crc = 0;
break;
case EXT4_FC_TAG_HEAD:
- head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
- if (le32_to_cpu(head->fc_features) &
+ memcpy(&head, val, sizeof(head));
+ if (le32_to_cpu(head.fc_features) &
~EXT4_FC_SUPPORTED_FEATURES) {
ret = -EOPNOTSUPP;
break;
}
- if (le32_to_cpu(head->fc_tid) != expected_tid) {
+ if (le32_to_cpu(head.fc_tid) != expected_tid) {
ret = JBD2_FC_REPLAY_STOP;
break;
}
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
- sizeof(*tl) + ext4_fc_tag_len(tl));
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ sizeof(tl) + le16_to_cpu(tl.fc_len));
break;
default:
ret = state->fc_replay_num_tags ?
@@ -2036,11 +2043,11 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_fc_tl *tl;
- __u8 *start, *end;
+ struct ext4_fc_tl tl;
+ __u8 *start, *end, *cur, *val;
int ret = JBD2_FC_REPLAY_CONTINUE;
struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
- struct ext4_fc_tail *tail;
+ struct ext4_fc_tail tail;
if (pass == PASS_SCAN) {
state->fc_current_pass = PASS_SCAN;
@@ -2067,49 +2074,52 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
start = (u8 *)bh->b_data;
end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
- fc_for_each_tl(start, end, tl) {
+ for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
+ memcpy(&tl, cur, sizeof(tl));
+ val = cur + sizeof(tl);
+
if (state->fc_replay_num_tags == 0) {
ret = JBD2_FC_REPLAY_STOP;
ext4_fc_set_bitmaps_and_counters(sb);
break;
}
jbd_debug(3, "Replay phase, tag:%s\n",
- tag2str(le16_to_cpu(tl->fc_tag)));
+ tag2str(le16_to_cpu(tl.fc_tag)));
state->fc_replay_num_tags--;
- switch (le16_to_cpu(tl->fc_tag)) {
+ switch (le16_to_cpu(tl.fc_tag)) {
case EXT4_FC_TAG_LINK:
- ret = ext4_fc_replay_link(sb, tl);
+ ret = ext4_fc_replay_link(sb, &tl, val);
break;
case EXT4_FC_TAG_UNLINK:
- ret = ext4_fc_replay_unlink(sb, tl);
+ ret = ext4_fc_replay_unlink(sb, &tl, val);
break;
case EXT4_FC_TAG_ADD_RANGE:
- ret = ext4_fc_replay_add_range(sb, tl);
+ ret = ext4_fc_replay_add_range(sb, &tl, val);
break;
case EXT4_FC_TAG_CREAT:
- ret = ext4_fc_replay_create(sb, tl);
+ ret = ext4_fc_replay_create(sb, &tl, val);
break;
case EXT4_FC_TAG_DEL_RANGE:
- ret = ext4_fc_replay_del_range(sb, tl);
+ ret = ext4_fc_replay_del_range(sb, &tl, val);
break;
case EXT4_FC_TAG_INODE:
- ret = ext4_fc_replay_inode(sb, tl);
+ ret = ext4_fc_replay_inode(sb, &tl, val);
break;
case EXT4_FC_TAG_PAD:
trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
- ext4_fc_tag_len(tl), 0);
+ le16_to_cpu(tl.fc_len), 0);
break;
case EXT4_FC_TAG_TAIL:
trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
- ext4_fc_tag_len(tl), 0);
- tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
- WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
+ le16_to_cpu(tl.fc_len), 0);
+ memcpy(&tail, val, sizeof(tail));
+ WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
break;
case EXT4_FC_TAG_HEAD:
break;
default:
- trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
- ext4_fc_tag_len(tl), 0);
+ trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
+ le16_to_cpu(tl.fc_len), 0);
ret = -ECANCELED;
break;
}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index b77f70f55a62..937c381b4c85 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -153,13 +153,6 @@ struct ext4_fc_replay_state {
#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
#endif
-#define fc_for_each_tl(__start, __end, __tl) \
- for (tl = (struct ext4_fc_tl *)(__start); \
- (__u8 *)tl < (__u8 *)(__end); \
- tl = (struct ext4_fc_tl *)((__u8 *)tl + \
- sizeof(struct ext4_fc_tl) + \
- + le16_to_cpu(tl->fc_len)))
-
static inline const char *tag2str(__u16 tag)
{
switch (tag) {
@@ -186,16 +179,4 @@ static inline const char *tag2str(__u16 tag)
}
}
-/* Get length of a particular tlv */
-static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
-{
- return le16_to_cpu(tl->fc_len);
-}
-
-/* Get a pointer to "value" of a tlv */
-static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
-{
- return (__u8 *)tl + sizeof(*tl);
-}
-
#endif /* __FAST_COMMIT_H__ */
diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h
index 68c8001fee85..ac642be2302e 100644
--- a/fs/ext4/fsmap.h
+++ b/fs/ext4/fsmap.h
@@ -50,7 +50,7 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */
#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */
-#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */
-#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */
+#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */
+#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */
#endif /* __EXT4_FSMAP_H__ */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 81a17a3cd80e..e89fc0f770b0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -322,14 +322,16 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (is_directory) {
count = ext4_used_dirs_count(sb, gdp) - 1;
ext4_used_dirs_set(sb, gdp, count);
- percpu_counter_dec(&sbi->s_dirs_counter);
+ if (percpu_counter_initialized(&sbi->s_dirs_counter))
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
if (sbi->s_log_groups_per_flex) {
struct flex_groups *fg;
@@ -400,7 +402,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*
* We always try to spread first-level directories.
*
- * If there are blockgroups with both free inodes and free blocks counts
+ * If there are blockgroups with both free inodes and free clusters counts
* not worse than average we return one with smallest directory count.
* Otherwise we simply return a random group.
*
@@ -409,7 +411,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
* It's OK to put directory into a group unless
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
- * it has too few free blocks left (min_blocks) or
+ * it has too few free clusters left (min_clusters) or
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
@@ -425,7 +427,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei, grp_free;
- ext4_fsblk_t freeb, avefreec;
+ ext4_fsblk_t freec, avefreec;
unsigned int ndirs;
int max_dirs, min_inodes;
ext4_grpblk_t min_clusters;
@@ -444,9 +446,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
- freeb = EXT4_C2B(sbi,
- percpu_counter_read_positive(&sbi->s_freeclusters_counter));
- avefreec = freeb;
+ freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
+ avefreec = freec;
do_div(avefreec, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3cf01629010d..70cb64db33f7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -204,7 +204,7 @@ out:
/*
* write the buffer to the inline inode.
* If 'create' is set, we don't need to do the extra copy in the xattr
- * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * value since it is already handled by ext4_xattr_ibody_set.
* That saves us one memcpy.
*/
static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
@@ -286,7 +286,7 @@ static int ext4_create_inline_data(handle_t *handle,
BUG_ON(!is.s.not_found);
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
if (error == -ENOSPC)
ext4_clear_inode_state(inode,
@@ -358,7 +358,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
i.value = value;
i.value_len = len;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -431,7 +431,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
if (error)
goto out;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -1925,8 +1925,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
i.value = value;
i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
- err = ext4_xattr_ibody_inline_set(handle, inode,
- &i, &is);
+ err = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (err)
goto out_error;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fe6045a46599..d8de607849df 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -374,7 +374,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
ei->i_reserved_data_blocks -= used;
percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ spin_unlock(&ei->i_block_reservation_lock);
/* Update quota subsystem for data blocks */
if (quota_claim)
@@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
journal = EXT4_JOURNAL(inode);
jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
jbd2_journal_unlock_updates(journal);
if (err)
@@ -3418,7 +3418,7 @@ retry:
* i_disksize out to i_size. This could be beyond where direct I/O is
* happening and thus expose allocated blocks to direct I/O reads.
*/
- else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
m_flags = EXT4_GET_BLOCKS_CREATE;
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -3701,7 +3701,7 @@ static const struct address_space_operations ext4_da_aops = {
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.bmap = ext4_bmap,
.invalidatepage = noop_invalidatepage,
.swap_activate = ext4_iomap_swap_activate,
@@ -6005,7 +6005,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else {
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_writepages_rwsem);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 31627f7dc5cd..e27f34bceb8d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -659,10 +659,9 @@ static int ext4_ioc_getfsmap(struct super_block *sb,
info.gi_sb = sb;
info.gi_data = arg;
error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info);
- if (error == EXT4_QUERY_RANGE_ABORT) {
- error = 0;
+ if (error == EXT4_QUERY_RANGE_ABORT)
aborted = true;
- } else if (error)
+ else if (error)
return error;
/* If we didn't abort, set the "last" flag in the last fmx */
@@ -693,13 +692,6 @@ static long ext4_ioctl_group_add(struct file *file,
if (err)
return err;
- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Online resizing not supported with bigalloc");
- err = -EOPNOTSUPP;
- goto group_add_out;
- }
-
err = mnt_want_write_file(file);
if (err)
goto group_add_out;
@@ -707,7 +699,7 @@ static long ext4_ioctl_group_add(struct file *file,
err = ext4_group_add(sb, input);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -800,6 +792,57 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
return error;
}
+static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
+{
+ int err = 0;
+ __u32 flags = 0;
+ unsigned int flush_flags = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+ struct request_queue *q;
+
+ if (copy_from_user(&flags, (__u32 __user *)arg,
+ sizeof(__u32)))
+ return -EFAULT;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* check for invalid bits set */
+ if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) ||
+ ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+ (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+ return -EINVAL;
+
+ if (!EXT4_SB(sb)->s_journal)
+ return -ENODEV;
+
+ if (flags & ~JBD2_JOURNAL_FLUSH_VALID)
+ return -EINVAL;
+
+ q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev);
+ if (!q)
+ return -ENXIO;
+ if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+ return 0;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD)
+ flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) {
+ flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT;
+ pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow");
+ }
+
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+ return err;
+}
+
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -871,13 +914,6 @@ setversion_out:
goto group_extend_out;
}
- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Online resizing not supported with bigalloc");
- err = -EOPNOTSUPP;
- goto group_extend_out;
- }
-
err = mnt_want_write_file(filp);
if (err)
goto group_extend_out;
@@ -885,7 +921,7 @@ setversion_out:
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -1028,7 +1064,7 @@ mext_out:
if (EXT4_SB(sb)->s_journal) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -1211,6 +1247,9 @@ resizefs_out:
return fsverity_ioctl_read_metadata(filp,
(const void __user *)arg);
+ case EXT4_IOC_CHECKPOINT:
+ return ext4_ioctl_checkpoint(filp, arg);
+
default:
return -ENOTTY;
}
@@ -1291,6 +1330,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
+ case EXT4_IOC_CHECKPOINT:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3239e6669e84..c2c22c2baac0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3217,7 +3217,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
*/
if (sbi->s_es->s_log_groups_per_flex >= 32) {
ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
- goto err_freesgi;
+ goto err_freebuddy;
}
sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 68fbeedd627b..6cb598b549ca 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -127,9 +127,9 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
*/
static int kmmpd(void *data)
{
- struct super_block *sb = ((struct mmpd_data *) data)->sb;
- struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+ struct super_block *sb = (struct super_block *) data;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
struct mmp_struct *mmp;
ext4_fsblk_t mmp_block;
u32 seq = 0;
@@ -245,12 +245,18 @@ static int kmmpd(void *data)
retval = write_mmp_block(sb, bh);
exit_thread:
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(data);
- brelse(bh);
return retval;
}
+void ext4_stop_mmpd(struct ext4_sb_info *sbi)
+{
+ if (sbi->s_mmp_tsk) {
+ kthread_stop(sbi->s_mmp_tsk);
+ brelse(sbi->s_mmp_bh);
+ sbi->s_mmp_tsk = NULL;
+ }
+}
+
/*
* Get a random new sequence number but make sure it is not greater than
* EXT4_MMP_SEQ_MAX.
@@ -275,7 +281,6 @@ int ext4_multi_mount_protect(struct super_block *sb,
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = NULL;
struct mmp_struct *mmp = NULL;
- struct mmpd_data *mmpd_data;
u32 seq;
unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned int wait_time = 0;
@@ -364,24 +369,17 @@ skip:
goto failed;
}
- mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL);
- if (!mmpd_data) {
- ext4_warning(sb, "not enough memory for mmpd_data");
- goto failed;
- }
- mmpd_data->sb = sb;
- mmpd_data->bh = bh;
+ EXT4_SB(sb)->s_mmp_bh = bh;
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
(int)sizeof(mmp->mmp_bdevname),
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(mmpd_data);
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
goto failed;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index afb9d05a99ba..5fd56f616cf0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1376,7 +1376,8 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
struct dx_hash_info *hinfo = &name->hinfo;
int len;
- if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) {
+ if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding ||
+ (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) {
cf_name->name = NULL;
return 0;
}
@@ -1427,7 +1428,8 @@ static bool ext4_match(struct inode *parent,
#endif
#ifdef CONFIG_UNICODE
- if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) {
+ if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) &&
+ (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
if (fname->cf_name.name) {
struct qstr cf = {.name = fname->cf_name.name,
.len = fname->cf_name.len};
@@ -2497,7 +2499,7 @@ again:
/* Which index block gets the new entry? */
if (at - entries >= icount1) {
- frame->at = at = at - entries - icount1 + entries2;
+ frame->at = at - entries - icount1 + entries2;
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bd0d185654f3..fc885914c88a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -74,6 +74,15 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
}
+ if (ext4_has_feature_bigalloc(sb)) {
+ ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc");
+ return -EOPNOTSUPP;
+ }
+ if (ext4_has_feature_sparse_super2(sb)) {
+ ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2");
+ return -EOPNOTSUPP;
+ }
+
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
&EXT4_SB(sb)->s_ext4_flags))
ret = -EBUSY;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7dc94f3e18e6..20344633bdd9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -718,6 +718,7 @@ static void flush_stashed_error_work(struct work_struct *work)
goto write_directly;
}
jbd2_journal_stop(handle);
+ ext4_notify_error_sysfs(sbi);
return;
}
write_directly:
@@ -726,6 +727,7 @@ write_directly:
* out and hope for the best.
*/
ext4_commit_super(sbi->s_sb);
+ ext4_notify_error_sysfs(sbi);
}
#define ext4_error_ratelimit(sb) \
@@ -1174,6 +1176,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_sysfs(sb);
if (sbi->s_journal) {
+ jbd2_journal_unregister_shrinker(sbi->s_journal);
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -1245,8 +1248,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
sbi->s_ea_block_cache = NULL;
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ ext4_stop_mmpd(sbi);
+
brelse(sbi->s_sbh);
sb->s_fs_info = NULL;
/*
@@ -1441,26 +1444,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode)
return ext4_write_inode(inode, &wbc);
}
-/*
- * Try to release metadata pages (indirect blocks, directories) which are
- * mapped via the block device. Since these pages could have journal heads
- * which would prevent try_to_free_buffers() from freeing them, we must use
- * jbd2 layer's try_to_free_buffers() function to release them.
- */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
- gfp_t wait)
-{
- journal_t *journal = EXT4_SB(sb)->s_journal;
-
- WARN_ON(PageChecked(page));
- if (!page_has_buffers(page))
- return 0;
- if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page);
-
- return try_to_free_buffers(page);
-}
-
#ifdef CONFIG_FS_ENCRYPTION
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
@@ -1655,7 +1638,6 @@ static const struct super_operations ext4_sops = {
.quota_write = ext4_quota_write,
.get_dquots = ext4_get_dquots,
#endif
- .bdev_try_to_free_page = bdev_try_to_free_page,
};
static const struct export_operations ext4_export_ops = {
@@ -3101,8 +3083,15 @@ static void ext4_orphan_cleanup(struct super_block *sb,
inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ret = ext4_truncate(inode);
- if (ret)
+ if (ret) {
+ /*
+ * We need to clean up the in-core orphan list
+ * manually if ext4_truncate() failed to get a
+ * transaction handle.
+ */
+ ext4_orphan_del(NULL, inode);
ext4_std_error(inode->i_sb, ret);
+ }
inode_unlock(inode);
nr_truncates++;
} else {
@@ -4462,14 +4451,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (sb->s_blocksize != blocksize) {
+ /*
+ * bh must be released before kill_bdev(), otherwise
+ * it won't be freed and its page also. kill_bdev()
+ * is called by sb_set_blocksize().
+ */
+ brelse(bh);
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
ext4_msg(sb, KERN_ERR, "bad block size %d",
blocksize);
+ bh = NULL;
goto failed_mount;
}
- brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
@@ -5052,6 +5047,7 @@ no_journal:
ext4_msg(sb, KERN_ERR,
"unable to initialize "
"flex_bg meta info!");
+ ret = -ENOMEM;
goto failed_mount6;
}
@@ -5172,6 +5168,7 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
+ jbd2_journal_unregister_shrinker(sbi->s_journal);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
@@ -5180,8 +5177,7 @@ failed_mount3a:
failed_mount3:
flush_work(&sbi->s_error_work);
del_timer_sync(&sbi->s_err_report);
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ ext4_stop_mmpd(sbi);
failed_mount2:
rcu_read_lock();
group_desc = rcu_dereference(sbi->s_group_desc);
@@ -5202,8 +5198,9 @@ failed_mount:
kfree(get_qf_name(sb, sbi, i));
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
- ext4_blkdev_remove(sbi);
+ /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
brelse(bh);
+ ext4_blkdev_remove(sbi);
out_fail:
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
@@ -5497,6 +5494,12 @@ static int ext4_load_journal(struct super_block *sb,
ext4_commit_super(sb);
}
+ err = jbd2_journal_register_shrinker(journal);
+ if (err) {
+ EXT4_SB(sb)->s_journal = NULL;
+ goto err_out;
+ }
+
return 0;
err_out:
@@ -5639,7 +5642,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
return 0;
}
jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
if (err < 0)
goto out;
@@ -5781,7 +5784,7 @@ static int ext4_freeze(struct super_block *sb)
* Don't clear the needs_recovery flag if we failed to
* flush the journal.
*/
- error = jbd2_journal_flush(journal);
+ error = jbd2_journal_flush(journal, 0);
if (error < 0)
goto out;
@@ -5982,8 +5985,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
*/
ext4_mark_recovery_complete(sb, es);
}
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ ext4_stop_mmpd(sbi);
} else {
/* Make sure we can mount this feature set readwrite */
if (ext4_has_feature_readonly(sb) ||
@@ -6376,7 +6378,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* otherwise be livelocked...
*/
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
if (err)
return err;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 6f825dedc3d4..2314f7446592 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -315,7 +315,9 @@ EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
EXT4_ATTR_FEATURE(fast_commit);
+#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
EXT4_ATTR_FEATURE(encrypted_casefold);
+#endif
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
@@ -333,7 +335,9 @@ static struct attribute *ext4_feat_attrs[] = {
#endif
ATTR_LIST(metadata_csum_seed),
ATTR_LIST(fast_commit),
+#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
ATTR_LIST(encrypted_casefold),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(ext4_feat);
@@ -502,6 +506,11 @@ static struct kobj_type ext4_feat_ktype = {
.release = (void (*)(struct kobject *))kfree,
};
+void ext4_notify_error_sysfs(struct ext4_sb_info *sbi)
+{
+ sysfs_notify(&sbi->s_kobj, NULL, "errors_count");
+}
+
static struct kobject *ext4_root;
static struct kobject *ext4_feat;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 10ba4b24a0aa..6dd5c05c444a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2190,31 +2190,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
-{
- struct ext4_xattr_ibody_header *header;
- struct ext4_xattr_search *s = &is->s;
- int error;
-
- if (EXT4_I(inode)->i_extra_isize == 0)
- return -ENOSPC;
- error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
- if (error)
- return error;
- header = IHDR(inode, ext4_raw_inode(&is->iloc));
- if (!IS_LAST_ENTRY(s->first)) {
- header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- } else {
- header->h_magic = cpu_to_le32(0);
- ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
- }
- return 0;
-}
-
-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is)
{
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 730b91fa0dd7..77efb9a627ad 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -186,9 +186,9 @@ extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
const char *name,
void *buffer, size_t buffer_size);
-extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index bab9b202b496..de0c9b013a85 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -342,6 +342,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations fat_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = fat_readpage,
.readahead = fat_readahead,
.writepage = fat_writepage,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e91980f49388..06d04a74ab6c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode,
return false;
}
-/**
- * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
- * @inode: inode to be removed
- * @wb: bdi_writeback @inode is being removed from
- *
- * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
- * clear %WB_has_dirty_io if all are empty afterwards.
- */
-static void inode_io_list_del_locked(struct inode *inode,
- struct bdi_writeback *wb)
-{
- assert_spin_locked(&wb->list_lock);
- assert_spin_locked(&inode->i_lock);
-
- inode->i_state &= ~I_SYNC_QUEUED;
- list_del_init(&inode->i_io_list);
- wb_io_lists_depopulated(wb);
-}
-
static void wb_wakeup(struct bdi_writeback *wb)
{
spin_lock_bh(&wb->work_lock);
@@ -244,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done)
/* one round can affect upto 5 slots */
#define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */
+/*
+ * Maximum inodes per isw. A specific value has been chosen to make
+ * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
+ */
+#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \
+ / sizeof(struct inode *))
+
static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;
@@ -279,6 +267,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
+ * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
+ * @inode: inode of interest with i_lock held
+ * @wb: target bdi_writeback
+ *
+ * Remove the inode from wb's io lists and if necessarily put onto b_attached
+ * list. Only inodes attached to cgwb's are kept on this list.
+ */
+static void inode_cgwb_move_to_attached(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+
+ inode->i_state &= ~I_SYNC_QUEUED;
+ if (wb != &wb->bdi->wb)
+ list_move(&inode->i_io_list, &wb->b_attached);
+ else
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+}
+
+/**
* locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
* @inode: inode of interest with i_lock held
*
@@ -332,11 +342,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}
struct inode_switch_wbs_context {
- struct inode *inode;
- struct bdi_writeback *new_wb;
+ struct rcu_work work;
- struct rcu_head rcu_head;
- struct work_struct work;
+ /*
+ * Multiple inodes can be switched at once. The switching procedure
+ * consists of two parts, separated by a RCU grace period. To make
+ * sure that the second part is executed for each inode gone through
+ * the first part, all inode pointers are placed into a NULL-terminated
+ * array embedded into struct inode_switch_wbs_context. Otherwise
+ * an inode could be left in a non-consistent state.
+ */
+ struct bdi_writeback *new_wb;
+ struct inode *inodes[];
};
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
@@ -349,50 +366,23 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
up_write(&bdi->wb_switch_rwsem);
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static bool inode_do_switch_wbs(struct inode *inode,
+ struct bdi_writeback *old_wb,
+ struct bdi_writeback *new_wb)
{
- struct inode_switch_wbs_context *isw =
- container_of(work, struct inode_switch_wbs_context, work);
- struct inode *inode = isw->inode;
- struct backing_dev_info *bdi = inode_to_bdi(inode);
struct address_space *mapping = inode->i_mapping;
- struct bdi_writeback *old_wb = inode->i_wb;
- struct bdi_writeback *new_wb = isw->new_wb;
XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
bool switched = false;
- /*
- * If @inode switches cgwb membership while sync_inodes_sb() is
- * being issued, sync_inodes_sb() might miss it. Synchronize.
- */
- down_read(&bdi->wb_switch_rwsem);
-
- /*
- * By the time control reaches here, RCU grace period has passed
- * since I_WB_SWITCH assertion and all wb stat update transactions
- * between unlocked_inode_to_wb_begin/end() are guaranteed to be
- * synchronizing against the i_pages lock.
- *
- * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
- * gives us exclusion against all wb related operations on @inode
- * including IO list manipulations and stat updates.
- */
- if (old_wb < new_wb) {
- spin_lock(&old_wb->list_lock);
- spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
- } else {
- spin_lock(&new_wb->list_lock);
- spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
- }
spin_lock(&inode->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
- * Once I_FREEING is visible under i_lock, the eviction path owns
- * the inode and we shouldn't modify ->i_io_list.
+ * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
+ * path owns the inode and we shouldn't modify ->i_io_list.
*/
- if (unlikely(inode->i_state & I_FREEING))
+ if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
goto skip_switch;
trace_inode_switch_wbs(inode, old_wb, new_wb);
@@ -419,21 +409,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
wb_get(new_wb);
/*
- * Transfer to @new_wb's IO list if necessary. The specific list
- * @inode was on is ignored and the inode is put on ->b_dirty which
- * is always correct including from ->b_dirty_time. The transfer
- * preserves @inode->dirtied_when ordering.
+ * Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
+ * the specific list @inode was on is ignored and the @inode is put on
+ * ->b_dirty which is always correct including from ->b_dirty_time.
+ * The transfer preserves @inode->dirtied_when ordering. If the @inode
+ * was clean, it means it was on the b_attached list, so move it onto
+ * the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
- struct inode *pos;
-
- inode_io_list_del_locked(inode, old_wb);
inode->i_wb = new_wb;
- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
- if (time_after_eq(inode->dirtied_when,
- pos->dirtied_when))
- break;
- inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
+
+ if (inode->i_state & I_DIRTY_ALL) {
+ struct inode *pos;
+
+ list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
+ if (time_after_eq(inode->dirtied_when,
+ pos->dirtied_when))
+ break;
+ inode_io_list_move_locked(inode, new_wb,
+ pos->i_io_list.prev);
+ } else {
+ inode_cgwb_move_to_attached(inode, new_wb);
+ }
} else {
inode->i_wb = new_wb;
}
@@ -452,31 +449,91 @@ skip_switch:
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
+
+ return switched;
+}
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct inode_switch_wbs_context *isw =
+ container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+ struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
+ struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
+ struct bdi_writeback *new_wb = isw->new_wb;
+ unsigned long nr_switched = 0;
+ struct inode **inodep;
+
+ /*
+ * If @inode switches cgwb membership while sync_inodes_sb() is
+ * being issued, sync_inodes_sb() might miss it. Synchronize.
+ */
+ down_read(&bdi->wb_switch_rwsem);
+
+ /*
+ * By the time control reaches here, RCU grace period has passed
+ * since I_WB_SWITCH assertion and all wb stat update transactions
+ * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+ * synchronizing against the i_pages lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
+ */
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
+
+ for (inodep = isw->inodes; *inodep; inodep++) {
+ WARN_ON_ONCE((*inodep)->i_wb != old_wb);
+ if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
+ nr_switched++;
+ }
+
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
up_read(&bdi->wb_switch_rwsem);
- if (switched) {
+ if (nr_switched) {
wb_wakeup(new_wb);
- wb_put(old_wb);
+ wb_put_many(old_wb, nr_switched);
}
- wb_put(new_wb);
- iput(inode);
+ for (inodep = isw->inodes; *inodep; inodep++)
+ iput(*inodep);
+ wb_put(new_wb);
kfree(isw);
-
atomic_dec(&isw_nr_in_flight);
}
-static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+static bool inode_prepare_wbs_switch(struct inode *inode,
+ struct bdi_writeback *new_wb)
{
- struct inode_switch_wbs_context *isw = container_of(rcu_head,
- struct inode_switch_wbs_context, rcu_head);
+ /*
+ * Paired with smp_mb() in cgroup_writeback_umount().
+ * isw_nr_in_flight must be increased before checking SB_ACTIVE and
+ * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
+ * in cgroup_writeback_umount() and the isw_wq will be not flushed.
+ */
+ smp_mb();
- /* needs to grab bh-unsafe locks, bounce to work item */
- INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- queue_work(isw_wq, &isw->work);
+ /* while holding I_WB_SWITCH, no one else can update the association */
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+ inode_to_wb(inode) == new_wb) {
+ spin_unlock(&inode->i_lock);
+ return false;
+ }
+ inode->i_state |= I_WB_SWITCH;
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+
+ return true;
}
/**
@@ -501,32 +558,30 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
return;
- isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+ isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC);
if (!isw)
return;
+ atomic_inc(&isw_nr_in_flight);
+
/* find and pin the new wb */
rcu_read_lock();
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
- if (memcg_css)
- isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ if (memcg_css && !css_tryget(memcg_css))
+ memcg_css = NULL;
rcu_read_unlock();
+ if (!memcg_css)
+ goto out_free;
+
+ isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ css_put(memcg_css);
if (!isw->new_wb)
goto out_free;
- /* while holding I_WB_SWITCH, no one else can update the association */
- spin_lock(&inode->i_lock);
- if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
- inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- inode_to_wb(inode) == isw->new_wb) {
- spin_unlock(&inode->i_lock);
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
goto out_free;
- }
- inode->i_state |= I_WB_SWITCH;
- __iget(inode);
- spin_unlock(&inode->i_lock);
- isw->inode = inode;
+ isw->inodes[0] = inode;
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
@@ -534,18 +589,85 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
- call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-
- atomic_inc(&isw_nr_in_flight);
+ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+ queue_rcu_work(isw_wq, &isw->work);
return;
out_free:
+ atomic_dec(&isw_nr_in_flight);
if (isw->new_wb)
wb_put(isw->new_wb);
kfree(isw);
}
/**
+ * cleanup_offline_cgwb - detach associated inodes
+ * @wb: target wb
+ *
+ * Switch all inodes attached to @wb to a nearest living ancestor's wb in order
+ * to eventually release the dying @wb. Returns %true if not all inodes were
+ * switched and the function has to be restarted.
+ */
+bool cleanup_offline_cgwb(struct bdi_writeback *wb)
+{
+ struct cgroup_subsys_state *memcg_css;
+ struct inode_switch_wbs_context *isw;
+ struct inode *inode;
+ int nr;
+ bool restart = false;
+
+ isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW *
+ sizeof(struct inode *), GFP_KERNEL);
+ if (!isw)
+ return restart;
+
+ atomic_inc(&isw_nr_in_flight);
+
+ for (memcg_css = wb->memcg_css->parent; memcg_css;
+ memcg_css = memcg_css->parent) {
+ isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
+ if (isw->new_wb)
+ break;
+ }
+ if (unlikely(!isw->new_wb))
+ isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
+
+ nr = 0;
+ spin_lock(&wb->list_lock);
+ list_for_each_entry(inode, &wb->b_attached, i_io_list) {
+ if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+ continue;
+
+ isw->inodes[nr++] = inode;
+
+ if (nr >= WB_MAX_INODES_PER_ISW - 1) {
+ restart = true;
+ break;
+ }
+ }
+ spin_unlock(&wb->list_lock);
+
+ /* no attached inodes? bail out */
+ if (nr == 0) {
+ atomic_dec(&isw_nr_in_flight);
+ wb_put(isw->new_wb);
+ kfree(isw);
+ return restart;
+ }
+
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH tells
+ * the RCU protected stat update paths to grab the i_page
+ * lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+ */
+ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
+ queue_rcu_work(isw_wq, &isw->work);
+
+ return restart;
+}
+
+/**
* wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
* @wbc: writeback_control of interest
* @inode: target inode
@@ -1000,6 +1122,12 @@ out_bdi_put:
*/
void cgroup_writeback_umount(void)
{
+ /*
+ * SB_ACTIVE should be reliably cleared before checking
+ * isw_nr_in_flight, see generic_shutdown_super().
+ */
+ smp_mb();
+
if (atomic_read(&isw_nr_in_flight)) {
/*
* Use rcu_barrier() to wait for all pending callbacks to
@@ -1024,6 +1152,17 @@ fs_initcall(cgroup_writeback_init);
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void inode_cgwb_move_to_attached(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
+
+ inode->i_state &= ~I_SYNC_QUEUED;
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+}
+
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
__releases(&inode->i_lock)
@@ -1124,7 +1263,11 @@ void inode_io_list_del(struct inode *inode)
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
- inode_io_list_del_locked(inode, wb);
+
+ inode->i_state &= ~I_SYNC_QUEUED;
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+
spin_unlock(&inode->i_lock);
spin_unlock(&wb->list_lock);
}
@@ -1437,7 +1580,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
inode->i_state &= ~I_SYNC_QUEUED;
} else {
/* The inode is clean. Remove from writeback lists. */
- inode_io_list_del_locked(inode, wb);
+ inode_cgwb_move_to_attached(inode, wb);
}
}
@@ -1589,7 +1732,7 @@ static int writeback_single_inode(struct inode *inode,
* responsible for the writeback lists.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- inode_io_list_del_locked(inode, wb);
+ inode_cgwb_move_to_attached(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -2205,28 +2348,6 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
return ret;
}
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
-{
- if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- struct dentry *dentry;
- const char *name = "?";
-
- dentry = d_find_alias(inode);
- if (dentry) {
- spin_lock(&dentry->d_lock);
- name = (const char *) dentry->d_name.name;
- }
- printk(KERN_DEBUG
- "%s(%d): dirtied inode %lu (%s) on %s\n",
- current->comm, task_pid_nr(current), inode->i_ino,
- name, inode->i_sb->s_id);
- if (dentry) {
- spin_unlock(&dentry->d_lock);
- dput(dentry);
- }
- }
-}
-
/**
* __mark_inode_dirty - internal function to mark an inode dirty
*
@@ -2296,9 +2417,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
(dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
- if (unlikely(block_dump))
- block_dump___mark_inode_dirty(inode);
-
spin_lock(&inode->i_lock);
if (dirtytime && (inode->i_state & I_DIRTY_INODE))
goto out_unlock_inode;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index ff99ab2a3c43..fb733eb5aead 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -9,6 +9,7 @@
#include <linux/delay.h>
#include <linux/dax.h>
#include <linux/uio.h>
+#include <linux/pagemap.h>
#include <linux/pfn_t.h>
#include <linux/iomap.h>
#include <linux/interval_tree.h>
@@ -1329,7 +1330,7 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
static const struct address_space_operations fuse_dax_file_aops = {
.writepages = fuse_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.invalidatepage = noop_invalidatepage,
};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 23b5be3db044..81d8f064126e 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -784,7 +784,7 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
- .set_page_dirty = iomap_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = iomap_releasepage,
.invalidatepage = iomap_invalidatepage,
.bmap = gfs2_bmap,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0bcf11a9987b..ed8b67b21718 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -56,14 +56,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
u64 block, struct page *page)
{
struct inode *inode = &ip->i_inode;
- int release = 0;
-
- if (!page || page->index) {
- page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
- if (!page)
- return -ENOMEM;
- release = 1;
- }
if (!PageUptodate(page)) {
void *kaddr = kmap(page);
@@ -97,26 +89,10 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
gfs2_ordered_add_inode(ip);
}
- if (release) {
- unlock_page(page);
- put_page(page);
- }
-
return 0;
}
-/**
- * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
- * @ip: The GFS2 inode to unstuff
- * @page: The (optional) page. This is looked up if the @page is NULL
- *
- * This routine unstuffs a dinode and returns it to a "normal" state such
- * that the height can be grown in the traditional way.
- *
- * Returns: errno
- */
-
-int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
+static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
{
struct buffer_head *bh, *dibh;
struct gfs2_dinode *di;
@@ -124,11 +100,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
int isdir = gfs2_is_dir(ip);
int error;
- down_write(&ip->i_rw_mutex);
-
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
- goto out;
+ return error;
if (i_size_read(&ip->i_inode)) {
/* Get a free block, fill it with the stuffed data,
@@ -170,12 +144,38 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
out_brelse:
brelse(dibh);
+ return error;
+}
+
+/**
+ * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
+ * @ip: The GFS2 inode to unstuff
+ *
+ * This routine unstuffs a dinode and returns it to a "normal" state such
+ * that the height can be grown in the traditional way.
+ *
+ * Returns: errno
+ */
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip)
+{
+ struct inode *inode = &ip->i_inode;
+ struct page *page;
+ int error;
+
+ down_write(&ip->i_rw_mutex);
+ page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+ error = -ENOMEM;
+ if (!page)
+ goto out;
+ error = __gfs2_unstuff_inode(ip, page);
+ unlock_page(page);
+ put_page(page);
out:
up_write(&ip->i_rw_mutex);
return error;
}
-
/**
* find_metapath - Find path through the metadata tree
* @sdp: The superblock
@@ -1079,7 +1079,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
goto out_trans_fail;
if (unstuff) {
- ret = gfs2_unstuff_dinode(ip, NULL);
+ ret = gfs2_unstuff_dinode(ip);
if (ret)
goto out_trans_end;
release_metapath(mp);
@@ -2143,7 +2143,7 @@ static int do_grow(struct inode *inode, u64 size)
goto do_grow_release;
if (unstuff) {
- error = gfs2_unstuff_dinode(ip, NULL);
+ error = gfs2_unstuff_dinode(ip);
if (error)
goto do_end_trans;
}
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 6676d863faef..53cce6c08e81 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -46,7 +46,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
extern const struct iomap_ops gfs2_iomap_ops;
extern const struct iomap_writeback_ops gfs2_writeback_ops;
-extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip);
extern int gfs2_block_map(struct inode *inode, sector_t lblock,
struct buffer_head *bh, int create);
extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 18f67b37d6f8..42b7dfffb5e7 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -172,7 +172,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
return -EINVAL;
if (gfs2_is_stuffed(ip)) {
- error = gfs2_unstuff_dinode(ip, NULL);
+ error = gfs2_unstuff_dinode(ip);
if (error)
return error;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a0b542d84cd9..84ec053d43b4 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -210,7 +210,7 @@ void gfs2_set_inode_flags(struct inode *inode)
/**
* do_gfs2_set_flags - set flags on an inode
- * @filp: file pointer
+ * @inode: The inode
* @reqflags: The flags to set
* @mask: Indicates which flags are valid
* @fsflags: The FS_* inode flags passed in
@@ -427,22 +427,25 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
struct gfs2_alloc_parms ap = { .aflags = 0, };
u64 offset = page_offset(page);
unsigned int data_blocks, ind_blocks, rblocks;
+ vm_fault_t ret = VM_FAULT_LOCKED;
struct gfs2_holder gh;
unsigned int length;
loff_t size;
- int ret;
+ int err;
sb_start_pagefault(inode->i_sb);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- ret = gfs2_glock_nq(&gh);
- if (ret)
+ err = gfs2_glock_nq(&gh);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
goto out_uninit;
+ }
/* Check page index against inode size */
size = i_size_read(inode);
if (offset >= size) {
- ret = -EINVAL;
+ ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
@@ -450,8 +453,8 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
file_update_time(vmf->vma->vm_file);
/* page is wholly or partially inside EOF */
- if (offset > size - PAGE_SIZE)
- length = offset_in_page(size);
+ if (size - offset < PAGE_SIZE)
+ length = size - offset;
else
length = PAGE_SIZE;
@@ -469,24 +472,30 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
!gfs2_write_alloc_required(ip, offset, length)) {
lock_page(page);
if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
- ret = -EAGAIN;
+ ret = VM_FAULT_NOPAGE;
unlock_page(page);
}
goto out_unlock;
}
- ret = gfs2_rindex_update(sdp);
- if (ret)
+ err = gfs2_rindex_update(sdp);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
goto out_unlock;
+ }
gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks);
ap.target = data_blocks + ind_blocks;
- ret = gfs2_quota_lock_check(ip, &ap);
- if (ret)
+ err = gfs2_quota_lock_check(ip, &ap);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
goto out_unlock;
- ret = gfs2_inplace_reserve(ip, &ap);
- if (ret)
+ }
+ err = gfs2_inplace_reserve(ip, &ap);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
goto out_quota_unlock;
+ }
rblocks = RES_DINODE + ind_blocks;
if (gfs2_is_jdata(ip))
@@ -495,28 +504,38 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
rblocks += RES_STATFS + RES_QUOTA;
rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
}
- ret = gfs2_trans_begin(sdp, rblocks, 0);
- if (ret)
+ err = gfs2_trans_begin(sdp, rblocks, 0);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
goto out_trans_fail;
+ }
+
+ /* Unstuff, if required, and allocate backing blocks for page */
+ if (gfs2_is_stuffed(ip)) {
+ err = gfs2_unstuff_dinode(ip);
+ if (err) {
+ ret = block_page_mkwrite_return(err);
+ goto out_trans_end;
+ }
+ }
lock_page(page);
- ret = -EAGAIN;
/* If truncated, we must retry the operation, we may have raced
* with the glock demotion code.
*/
- if (!PageUptodate(page) || page->mapping != inode->i_mapping)
- goto out_trans_end;
+ if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+ ret = VM_FAULT_NOPAGE;
+ goto out_page_locked;
+ }
- /* Unstuff, if required, and allocate backing blocks for page */
- ret = 0;
- if (gfs2_is_stuffed(ip))
- ret = gfs2_unstuff_dinode(ip, page);
- if (ret == 0)
- ret = gfs2_allocate_page_backing(page, length);
+ err = gfs2_allocate_page_backing(page, length);
+ if (err)
+ ret = block_page_mkwrite_return(err);
-out_trans_end:
- if (ret)
+out_page_locked:
+ if (ret != VM_FAULT_LOCKED)
unlock_page(page);
+out_trans_end:
gfs2_trans_end(sdp);
out_trans_fail:
gfs2_inplace_release(ip);
@@ -526,12 +545,12 @@ out_unlock:
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
- if (ret == 0) {
+ if (ret == VM_FAULT_LOCKED) {
set_page_dirty(page);
wait_for_stable_page(page);
}
sb_end_pagefault(inode->i_sb);
- return block_page_mkwrite_return(ret);
+ return ret;
}
static vm_fault_t gfs2_fault(struct vm_fault *vmf)
@@ -911,8 +930,11 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
current->backing_dev_info = inode_to_bdi(inode);
buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
current->backing_dev_info = NULL;
- if (unlikely(buffered <= 0))
+ if (unlikely(buffered <= 0)) {
+ if (!ret)
+ ret = buffered;
goto out_unlock;
+ }
/*
* We need to ensure that the page cache pages are written to
@@ -959,7 +981,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
gfs2_trans_add_meta(ip->i_gl, dibh);
if (gfs2_is_stuffed(ip)) {
- error = gfs2_unstuff_dinode(ip, NULL);
+ error = gfs2_unstuff_dinode(ip);
if (unlikely(error))
goto out;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ea7fc5c641c7..1f3902ecdded 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -212,8 +212,7 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
spin_lock(&lru_lock);
- list_del(&gl->gl_lru);
- list_add_tail(&gl->gl_lru, &lru_list);
+ list_move_tail(&gl->gl_lru, &lru_list);
if (!test_bit(GLF_LRU, &gl->gl_flags)) {
set_bit(GLF_LRU, &gl->gl_flags);
@@ -582,6 +581,16 @@ out_locked:
spin_unlock(&gl->gl_lockref.lock);
}
+static bool is_system_glock(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ if (gl == m_ip->i_gl)
+ return true;
+ return false;
+}
+
/**
* do_xmote - Calls the DLM to change the state of a lock
* @gl: The lock state
@@ -671,17 +680,25 @@ skip_inval:
* to see sd_log_error and withdraw, and in the meantime, requeue the
* work for later.
*
+ * We make a special exception for some system glocks, such as the
+ * system statfs inode glock, which needs to be granted before the
+ * gfs2_quotad daemon can exit, and that exit needs to finish before
+ * we can unmount the withdrawn file system.
+ *
* However, if we're just unlocking the lock (say, for unmount, when
* gfs2_gl_hash_clear calls clear_glock) and recovery is complete
* then it's okay to tell dlm to unlock it.
*/
if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp)))
gfs2_withdraw_delayed(sdp);
- if (glock_blocked_by_withdraw(gl)) {
- if (target != LM_ST_UNLOCKED ||
- test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) {
+ if (glock_blocked_by_withdraw(gl) &&
+ (target != LM_ST_UNLOCKED ||
+ test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) {
+ if (!is_system_glock(gl)) {
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
goto out;
+ } else {
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
}
}
@@ -1466,9 +1483,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
glock_blocked_by_withdraw(gl) &&
gh->gh_gl != sdp->sd_jinode_gl) {
sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
might_sleep();
wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
}
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1775,6 +1794,7 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_first_entry(list, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
+ clear_bit(GLF_LRU, &gl->gl_flags);
if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
@@ -1820,7 +1840,6 @@ static long gfs2_scan_glock_lru(int nr)
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
- clear_bit(GLF_LRU, &gl->gl_flags);
freed++;
continue;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 454095e9fedf..54d3fbeb3002 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -396,7 +396,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
struct timespec64 atime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
- bool is_new = ip->i_inode.i_flags & I_NEW;
+ bool is_new = ip->i_inode.i_state & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
goto corrupt;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 97d54e581a7b..42c15cfc0821 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -926,10 +926,10 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
}
/**
- * ail_drain - drain the ail lists after a withdraw
+ * gfs2_ail_drain - drain the ail lists after a withdraw
* @sdp: Pointer to GFS2 superblock
*/
-static void ail_drain(struct gfs2_sbd *sdp)
+void gfs2_ail_drain(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
@@ -956,6 +956,7 @@ static void ail_drain(struct gfs2_sbd *sdp)
list_del(&tr->tr_list);
gfs2_trans_free(sdp, tr);
}
+ gfs2_drain_revokes(sdp);
spin_unlock(&sdp->sd_ail_lock);
}
@@ -1162,7 +1163,6 @@ out_withdraw:
if (tr && list_empty(&tr->tr_list))
list_add(&tr->tr_list, &sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- ail_drain(sdp); /* frees all transactions */
tr = NULL;
goto out_end;
}
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eea58015710e..fc905c2af53c 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -93,5 +93,6 @@ extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
extern void gfs2_flush_revokes(struct gfs2_sbd *sdp);
+extern void gfs2_ail_drain(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 221e7118cc3b..8ee05d25dfa6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -885,7 +885,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_log_write_page(sdp, page);
}
-static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+void gfs2_drain_revokes(struct gfs2_sbd *sdp)
{
struct list_head *head = &sdp->sd_log_revokes;
struct gfs2_bufdata *bd;
@@ -900,6 +900,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
}
}
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+ gfs2_drain_revokes(sdp);
+}
+
static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass)
{
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 31b6dd0d2e5d..f707601597dc 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -20,6 +20,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf);
extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, bool keep_cache);
+extern void gfs2_drain_revokes(struct gfs2_sbd *sdp);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
{
return sdp->sd_ldptrs;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index d68184ebbfdd..7c9619997355 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -89,11 +89,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
}
const struct address_space_operations gfs2_meta_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
const struct address_space_operations gfs2_rgrp_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 826f77d9cff5..5f4504dd0875 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -687,6 +687,7 @@ static int init_statfs(struct gfs2_sbd *sdp)
}
iput(pn);
+ pn = NULL;
ip = GFS2_I(sdp->sd_sc_inode);
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_sc_gh);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 94637c307cc8..be0997e24d60 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -825,7 +825,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
u64 size;
if (gfs2_is_stuffed(ip)) {
- err = gfs2_unstuff_dinode(ip, NULL);
+ err = gfs2_unstuff_dinode(ip);
if (err)
return err;
}
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 3e08027a6c81..f4325b44956d 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -131,6 +131,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc)
return;
+ gfs2_ail_drain(sdp); /* frees all transactions */
inode = sdp->sd_jdesc->jd_inode;
ip = GFS2_I(inode);
i_gl = ip->i_gl;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 3fc5cb346586..4a95a92546a0 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -159,6 +159,7 @@ static int hfs_writepages(struct address_space *mapping,
}
const struct address_space_operations hfs_btree_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
@@ -168,6 +169,7 @@ const struct address_space_operations hfs_btree_aops = {
};
const struct address_space_operations hfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 8ea447e5c470..6fef67c2a9f0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -156,6 +156,7 @@ static int hfsplus_writepages(struct address_space *mapping,
}
const struct address_space_operations hfsplus_btree_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
@@ -165,6 +166,7 @@ const struct address_space_operations hfsplus_btree_aops = {
};
const struct address_space_operations hfsplus_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
@@ -279,6 +281,11 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = hfsp_mt2ut(hip->create_date);
+ }
+
if (inode->i_flags & S_APPEND)
stat->attributes |= STATX_ATTR_APPEND;
if (inode->i_flags & S_IMMUTABLE)
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4d169c5a2673..e2855ceefd39 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -204,7 +204,6 @@ check_attr_tree_state_again:
buf = kzalloc(node_size, GFP_NOFS);
if (!buf) {
- pr_err("failed to allocate memory for header node\n");
err = -ENOMEM;
goto end_attr_file_creation;
}
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 077c25128eb7..c3a49aacf20a 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -196,6 +196,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
const struct address_space_operations hpfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
.readahead = hpfs_readahead,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9d9e0097c1d3..926eeb9bf4eb 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -529,7 +529,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
* the subpool and global reserve usage count can need
* to be adjusted.
*/
- VM_BUG_ON(PagePrivate(page));
+ VM_BUG_ON(HPageRestoreReserve(page));
remove_huge_page(page);
freed++;
if (!truncate_op) {
@@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
__SetPageUptodate(page);
error = huge_add_to_page_cache(page, mapping, index);
if (unlikely(error)) {
+ restore_reserve_on_error(h, &pseudo_vma, addr, page);
put_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out;
@@ -1445,7 +1446,7 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, struct user_struct **user,
+ vm_flags_t acctflag, struct ucounts **ucounts,
int creat_flags, int page_size_log)
{
struct inode *inode;
@@ -1457,20 +1458,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);
- *user = NULL;
+ *ucounts = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt)
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *user = current_user();
- if (user_shm_lock(size, *user)) {
+ *ucounts = current_ucounts();
+ if (user_shm_lock(size, *ucounts)) {
task_lock(current);
pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
current->comm, current->pid);
task_unlock(current);
} else {
- *user = NULL;
+ *ucounts = NULL;
return ERR_PTR(-EPERM);
}
}
@@ -1497,9 +1498,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
iput(inode);
out:
- if (*user) {
- user_shm_unlock(size, *user);
- *user = NULL;
+ if (*ucounts) {
+ user_shm_unlock(size, *ucounts);
+ *ucounts = NULL;
}
return file;
}
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 5361a9b4b47b..843d4a7bcd6e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -9,8 +9,6 @@
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
-#include <linux/mm.h>
-#include <linux/sched/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/rculist_nulls.h>
@@ -96,13 +94,14 @@ struct io_wqe {
struct io_wq *wq;
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
+
+ cpumask_var_t cpu_mask;
};
/*
* Per io_wq state
*/
struct io_wq {
- struct io_wqe **wqes;
unsigned long state;
free_work_fn *free_work;
@@ -110,14 +109,14 @@ struct io_wq {
struct io_wq_hash *hash;
- refcount_t refs;
-
atomic_t worker_refs;
struct completion worker_done;
struct hlist_node cpuhp_node;
struct task_struct *task;
+
+ struct io_wqe *wqes[];
};
static enum cpuhp_state io_wq_online;
@@ -241,7 +240,8 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
* Most likely an attempt to queue unbounded work on an io_wq that
* wasn't setup with any unbounded workers.
*/
- WARN_ON_ONCE(!acct->max_workers);
+ if (unlikely(!acct->max_workers))
+ pr_warn_once("io-wq is not configured for unbound workers");
rcu_read_lock();
ret = io_wqe_activate_free_worker(wqe);
@@ -560,17 +560,13 @@ loop:
if (ret)
continue;
/* timed out, exit unless we're the fixed worker */
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
- !(worker->flags & IO_WORKER_F_FIXED))
+ if (!(worker->flags & IO_WORKER_F_FIXED))
break;
}
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
raw_spin_lock_irq(&wqe->lock);
- if (!wq_list_empty(&wqe->work_list))
- io_worker_handle_work(worker);
- else
- raw_spin_unlock_irq(&wqe->lock);
+ io_worker_handle_work(worker);
}
io_worker_exit(worker);
@@ -645,7 +641,7 @@ fail:
tsk->pf_io_worker = worker;
worker->task = tsk;
- set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node));
+ set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
tsk->flags |= PF_NO_SETAFFINITY;
raw_spin_lock_irq(&wqe->lock);
@@ -901,23 +897,20 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
- int ret = -ENOMEM, node;
+ int ret, node;
struct io_wq *wq;
if (WARN_ON_ONCE(!data->free_work || !data->do_work))
return ERR_PTR(-EINVAL);
+ if (WARN_ON_ONCE(!bounded))
+ return ERR_PTR(-EINVAL);
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL);
if (!wq)
return ERR_PTR(-ENOMEM);
-
- wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
- if (!wq->wqes)
- goto err_wq;
-
ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
if (ret)
- goto err_wqes;
+ goto err_wq;
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
@@ -934,6 +927,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node);
if (!wqe)
goto err;
+ if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL))
+ goto err;
+ cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
wq->wqes[node] = wqe;
wqe->node = alloc_node;
wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
@@ -953,17 +949,18 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
}
wq->task = get_task_struct(data->task);
- refcount_set(&wq->refs, 1);
atomic_set(&wq->worker_refs, 1);
init_completion(&wq->worker_done);
return wq;
err:
io_wq_put_hash(data->hash);
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- for_each_node(node)
+ for_each_node(node) {
+ if (!wq->wqes[node])
+ continue;
+ free_cpumask_var(wq->wqes[node]->cpu_mask);
kfree(wq->wqes[node]);
-err_wqes:
- kfree(wq->wqes);
+ }
err_wq:
kfree(wq);
return ERR_PTR(ret);
@@ -979,13 +976,16 @@ static bool io_task_work_match(struct callback_head *cb, void *data)
return cwd->wqe->wq == data;
}
+void io_wq_exit_start(struct io_wq *wq)
+{
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+}
+
static void io_wq_exit_workers(struct io_wq *wq)
{
struct callback_head *cb;
int node;
- set_bit(IO_WQ_BIT_EXIT, &wq->state);
-
if (!wq->task)
return;
@@ -1003,13 +1003,16 @@ static void io_wq_exit_workers(struct io_wq *wq)
struct io_wqe *wqe = wq->wqes[node];
io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
- spin_lock_irq(&wq->hash->wait.lock);
- list_del_init(&wq->wqes[node]->wait.entry);
- spin_unlock_irq(&wq->hash->wait.lock);
}
rcu_read_unlock();
io_worker_ref_put(wq);
wait_for_completion(&wq->worker_done);
+
+ for_each_node(node) {
+ spin_lock_irq(&wq->hash->wait.lock);
+ list_del_init(&wq->wqes[node]->wait.entry);
+ spin_unlock_irq(&wq->hash->wait.lock);
+ }
put_task_struct(wq->task);
wq->task = NULL;
}
@@ -1020,8 +1023,6 @@ static void io_wq_destroy(struct io_wq *wq)
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- io_wq_exit_workers(wq);
-
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
struct io_cb_cancel_data match = {
@@ -1029,40 +1030,79 @@ static void io_wq_destroy(struct io_wq *wq)
.cancel_all = true,
};
io_wqe_cancel_pending_work(wqe, &match);
+ free_cpumask_var(wqe->cpu_mask);
kfree(wqe);
}
io_wq_put_hash(wq->hash);
- kfree(wq->wqes);
kfree(wq);
}
-void io_wq_put(struct io_wq *wq)
-{
- if (refcount_dec_and_test(&wq->refs))
- io_wq_destroy(wq);
-}
-
void io_wq_put_and_exit(struct io_wq *wq)
{
+ WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state));
+
io_wq_exit_workers(wq);
- io_wq_put(wq);
+ io_wq_destroy(wq);
}
+struct online_data {
+ unsigned int cpu;
+ bool online;
+};
+
static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
{
- set_cpus_allowed_ptr(worker->task, cpumask_of_node(worker->wqe->node));
+ struct online_data *od = data;
+ if (od->online)
+ cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask);
+ else
+ cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask);
return false;
}
+static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online)
+{
+ struct online_data od = {
+ .cpu = cpu,
+ .online = online
+ };
+ int i;
+
+ rcu_read_lock();
+ for_each_node(i)
+ io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od);
+ rcu_read_unlock();
+ return 0;
+}
+
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
+
+ return __io_wq_cpu_online(wq, cpu, true);
+}
+
+static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+ struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
+
+ return __io_wq_cpu_online(wq, cpu, false);
+}
+
+int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask)
+{
int i;
rcu_read_lock();
- for_each_node(i)
- io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL);
+ for_each_node(i) {
+ struct io_wqe *wqe = wq->wqes[i];
+
+ if (mask)
+ cpumask_copy(wqe->cpu_mask, mask);
+ else
+ cpumask_copy(wqe->cpu_mask, cpumask_of_node(i));
+ }
rcu_read_unlock();
return 0;
}
@@ -1072,7 +1112,7 @@ static __init int io_wq_init(void)
int ret;
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
- io_wq_cpu_online, NULL);
+ io_wq_cpu_online, io_wq_cpu_offline);
if (ret < 0)
return ret;
io_wq_online = ret;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 0e6d310999e8..3999ee58ff26 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -87,7 +87,6 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
- const struct cred *creds;
unsigned flags;
};
@@ -122,12 +121,14 @@ struct io_wq_data {
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
-void io_wq_put(struct io_wq *wq);
+void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
+int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
+
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e481ac8a757a..e55b21fc0ab2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -11,7 +11,7 @@
* before writing the tail (using smp_load_acquire to read the tail will
* do). It also needs a smp_mb() before updating CQ head (ordering the
* entry load(s) with the head store), pairing with an implicit barrier
- * through a control-dependency in io_get_cqring (smp_store_release to
+ * through a control-dependency in io_get_cqe (smp_store_release to
* store head will do). Failure to do so could lead to reading invalid
* CQ entries.
*
@@ -89,6 +89,7 @@
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
+#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
/*
* Shift of 9 is 512 entries, or exactly one page on 64-bit archs
@@ -100,11 +101,19 @@
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
+#define IO_RSRC_TAG_TABLE_SHIFT 9
+#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
+#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
+
#define IORING_MAX_REG_BUFFERS (1U << 14)
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
IOSQE_BUFFER_SELECT)
+#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+
+#define IO_TCTX_REFS_CACHE_NR (1U << 10)
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
@@ -164,7 +173,7 @@ struct io_rings {
* Written by the application, shouldn't be modified by the
* kernel.
*/
- u32 cq_flags;
+ u32 cq_flags;
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
@@ -243,7 +252,8 @@ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
struct io_rsrc_data {
struct io_ring_ctx *ctx;
- u64 *tags;
+ u64 **tags;
+ unsigned int nr;
rsrc_put_fn *do_put;
atomic_t refs;
struct completion done;
@@ -288,7 +298,6 @@ struct io_sq_data {
unsigned long state;
struct completion exited;
- struct callback_head *park_task_work;
};
#define IO_IOPOLL_BATCH 8
@@ -299,11 +308,8 @@ struct io_sq_data {
struct io_comp_state {
struct io_kiocb *reqs[IO_COMPL_BATCH];
unsigned int nr;
- unsigned int locked_free_nr;
/* inline/task_work completion list, under ->uring_lock */
struct list_head free_list;
- /* IRQ completion list, under ->completion_lock */
- struct list_head locked_free_list;
};
struct io_submit_link {
@@ -338,16 +344,23 @@ struct io_submit_state {
};
struct io_ring_ctx {
+ /* const or read-mostly hot data */
struct {
struct percpu_ref refs;
- } ____cacheline_aligned_in_smp;
- struct {
+ struct io_rings *rings;
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
+ unsigned int off_timeout_used: 1;
+ unsigned int drain_active: 1;
+ } ____cacheline_aligned_in_smp;
+
+ /* submission data */
+ struct {
+ struct mutex uring_lock;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -361,35 +374,33 @@ struct io_ring_ctx {
* array.
*/
u32 *sq_array;
+ struct io_uring_sqe *sq_sqes;
unsigned cached_sq_head;
unsigned sq_entries;
- unsigned sq_mask;
- unsigned sq_thread_idle;
- unsigned cached_sq_dropped;
- unsigned cached_cq_overflow;
- unsigned long sq_check_overflow;
+ struct list_head defer_list;
- /* hashed buffered write serialization */
- struct io_wq_hash *hash_map;
+ /*
+ * Fixed resources fast path, should be accessed only under
+ * uring_lock, and updated through io_uring_register(2)
+ */
+ struct io_rsrc_node *rsrc_node;
+ struct io_file_table file_table;
+ unsigned nr_user_files;
+ unsigned nr_user_bufs;
+ struct io_mapped_ubuf **user_bufs;
- struct list_head defer_list;
+ struct io_submit_state submit_state;
struct list_head timeout_list;
struct list_head cq_overflow_list;
-
- struct io_uring_sqe *sq_sqes;
- } ____cacheline_aligned_in_smp;
-
- struct {
- struct mutex uring_lock;
- wait_queue_head_t wait;
+ struct xarray io_buffers;
+ struct xarray personalities;
+ u32 pers_next;
+ unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
- struct io_submit_state submit_state;
-
- struct io_rings *rings;
-
- /* Only used for accounting purposes */
- struct mm_struct *mm_account;
+ /* IRQ completion list, under ->completion_lock */
+ struct list_head locked_free_list;
+ unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
@@ -397,44 +408,18 @@ struct io_ring_ctx {
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
- /*
- * If used, fixed file set. Writers must ensure that ->refs is dead,
- * readers must ensure that ->refs is alive as long as the file* is
- * used. Only updated through io_uring_register(2).
- */
- struct io_rsrc_data *file_data;
- struct io_file_table file_table;
- unsigned nr_user_files;
-
- /* if used, fixed mapped user buffers */
- struct io_rsrc_data *buf_data;
- unsigned nr_user_bufs;
- struct io_mapped_ubuf **user_bufs;
-
- struct user_struct *user;
-
- struct completion ref_comp;
-
-#if defined(CONFIG_UNIX)
- struct socket *ring_sock;
-#endif
-
- struct xarray io_buffers;
-
- struct xarray personalities;
- u32 pers_next;
+ unsigned long check_cq_overflow;
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- unsigned cq_mask;
- atomic_t cq_timeouts;
- unsigned cq_last_tm_flush;
- unsigned cq_extra;
- unsigned long cq_check_overflow;
+ struct eventfd_ctx *cq_ev_fd;
+ struct wait_queue_head poll_wait;
struct wait_queue_head cq_wait;
+ unsigned cq_extra;
+ atomic_t cq_timeouts;
struct fasync_struct *cq_fasync;
- struct eventfd_ctx *cq_ev_fd;
+ unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
struct {
@@ -449,29 +434,47 @@ struct io_ring_ctx {
struct list_head iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
- bool poll_multi_file;
+ bool poll_multi_queue;
} ____cacheline_aligned_in_smp;
- struct delayed_work rsrc_put_work;
- struct llist_head rsrc_put_llist;
- struct list_head rsrc_ref_list;
- spinlock_t rsrc_ref_lock;
- struct io_rsrc_node *rsrc_node;
- struct io_rsrc_node *rsrc_backup_node;
- struct io_mapped_ubuf *dummy_ubuf;
-
struct io_restriction restrictions;
- /* exit task_work */
- struct callback_head *exit_task_work;
+ /* slow path rsrc auxiliary data, used by update/register */
+ struct {
+ struct io_rsrc_node *rsrc_backup_node;
+ struct io_mapped_ubuf *dummy_ubuf;
+ struct io_rsrc_data *file_data;
+ struct io_rsrc_data *buf_data;
+
+ struct delayed_work rsrc_put_work;
+ struct llist_head rsrc_put_llist;
+ struct list_head rsrc_ref_list;
+ spinlock_t rsrc_ref_lock;
+ };
/* Keep this last, we don't need it for the fast path */
- struct work_struct exit_work;
- struct list_head tctx_list;
+ struct {
+ #if defined(CONFIG_UNIX)
+ struct socket *ring_sock;
+ #endif
+ /* hashed buffered write serialization */
+ struct io_wq_hash *hash_map;
+
+ /* Only used for accounting purposes */
+ struct user_struct *user;
+ struct mm_struct *mm_account;
+
+ /* ctx exit and cancellation */
+ struct callback_head *exit_task_work;
+ struct work_struct exit_work;
+ struct list_head tctx_list;
+ struct completion ref_comp;
+ };
};
struct io_uring_task {
/* submission side */
+ int cached_refs;
struct xarray xa;
struct wait_queue_head wait;
const struct io_ring_ctx *last;
@@ -706,7 +709,7 @@ enum {
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
/* first byte is taken by user flags, shift it to not overlap */
- REQ_F_FAIL_LINK_BIT = 8,
+ REQ_F_FAIL_BIT = 8,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
@@ -718,6 +721,7 @@ enum {
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_DONT_REISSUE_BIT,
+ REQ_F_CREDS_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_ASYNC_READ_BIT,
REQ_F_ASYNC_WRITE_BIT,
@@ -742,7 +746,7 @@ enum {
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* fail rest of links */
- REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
+ REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
/* on inflight list, should be cancelled and waited on exit reliably */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
@@ -771,6 +775,8 @@ enum {
REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
+ /* has creds assigned */
+ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
};
struct async_poll {
@@ -783,6 +789,11 @@ struct io_task_work {
task_work_func_t func;
};
+enum {
+ IORING_RSRC_FILE = 0,
+ IORING_RSRC_BUFFER = 1,
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -846,6 +857,8 @@ struct io_kiocb {
struct hlist_node hash_node;
struct async_poll *apoll;
struct io_wq_work work;
+ const struct cred *creds;
+
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
};
@@ -1029,11 +1042,11 @@ static const struct io_op_def io_op_defs[] = {
};
static bool io_disarm_next(struct io_kiocb *req);
-static void io_uring_del_task_file(unsigned long index);
+static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
- struct files_struct *files);
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
+ bool cancel_all);
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
@@ -1054,8 +1067,7 @@ static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_comp_state *cs,
- struct io_ring_ctx *ctx);
+static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
@@ -1101,15 +1113,14 @@ static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
percpu_ref_put(ref);
}
-static bool io_match_task(struct io_kiocb *head,
- struct task_struct *task,
- struct files_struct *files)
+static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
+ bool cancel_all)
{
struct io_kiocb *req;
if (task && head->task != task)
return false;
- if (!files)
+ if (cancel_all)
return true;
io_for_each_link(req, head) {
@@ -1119,10 +1130,9 @@ static bool io_match_task(struct io_kiocb *head,
return false;
}
-static inline void req_set_fail_links(struct io_kiocb *req)
+static inline void req_set_fail(struct io_kiocb *req)
{
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req->flags |= REQ_F_FAIL;
}
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1174,13 +1184,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->cq_wait);
+ init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
- init_waitqueue_head(&ctx->wait);
+ init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
@@ -1191,7 +1201,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_llist_head(&ctx->rsrc_put_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
- INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
+ INIT_LIST_HEAD(&ctx->locked_free_list);
return ctx;
err:
kfree(ctx->dummy_ubuf);
@@ -1200,13 +1210,20 @@ err:
return NULL;
}
+static void io_account_cq_overflow(struct io_ring_ctx *ctx)
+{
+ struct io_rings *r = ctx->rings;
+
+ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
+ ctx->cq_extra--;
+}
+
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
struct io_ring_ctx *ctx = req->ctx;
- return seq + ctx->cq_extra != ctx->cached_cq_tail
- + READ_ONCE(ctx->cached_cq_overflow);
+ return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
}
return false;
@@ -1225,8 +1242,10 @@ static void io_prep_async_work(struct io_kiocb *req)
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
- if (!req->work.creds)
- req->work.creds = get_current_cred();
+ if (!(req->flags & REQ_F_CREDS)) {
+ req->flags |= REQ_F_CREDS;
+ req->creds = get_current_cred();
+ }
req->work.list.next = NULL;
req->work.flags = 0;
@@ -1290,9 +1309,9 @@ static void io_kill_timeout(struct io_kiocb *req, int status)
}
}
-static void __io_queue_deferred(struct io_ring_ctx *ctx)
+static void io_queue_deferred(struct io_ring_ctx *ctx)
{
- do {
+ while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
@@ -1301,19 +1320,14 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
list_del_init(&de->list);
io_req_task_queue(de->req);
kfree(de);
- } while (!list_empty(&ctx->defer_list));
+ }
}
static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
- u32 seq;
+ u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- if (list_empty(&ctx->timeout_list))
- return;
-
- seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
-
- do {
+ while (!list_empty(&ctx->timeout_list)) {
u32 events_needed, events_got;
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
struct io_kiocb, timeout.list);
@@ -1335,27 +1349,31 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
list_del_init(&req->timeout.list);
io_kill_timeout(req, 0);
- } while (!list_empty(&ctx->timeout_list));
-
+ }
ctx->cq_last_tm_flush = seq;
}
-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- io_flush_timeouts(ctx);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+}
+static inline void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active))
+ __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
-
- if (unlikely(!list_empty(&ctx->defer_list)))
- __io_queue_deferred(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
- return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
+ return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
@@ -1363,21 +1381,21 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}
-static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
- unsigned tail;
+ unsigned tail, mask = ctx->cq_entries - 1;
/*
* writes to the cq entry need to come after reading head; the
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
- if (__io_cqring_events(ctx) == rings->cq_ring_entries)
+ if (__io_cqring_events(ctx) == ctx->cq_entries)
return NULL;
tail = ctx->cached_cq_tail++;
- return &rings->cqes[tail & ctx->cq_mask];
+ return &rings->cqes[tail & mask];
}
static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
@@ -1394,14 +1412,14 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
/* see waitqueue_active() comment */
smp_mb();
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->cq_wait))
+ wake_up(&ctx->cq_wait);
if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
+ if (waitqueue_active(&ctx->poll_wait)) {
+ wake_up_interruptible(&ctx->poll_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
@@ -1412,13 +1430,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
smp_mb();
if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->cq_wait))
+ wake_up(&ctx->cq_wait);
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
+ if (waitqueue_active(&ctx->poll_wait)) {
+ wake_up_interruptible(&ctx->poll_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
@@ -1426,17 +1444,16 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
- struct io_rings *rings = ctx->rings;
unsigned long flags;
bool all_flushed, posted;
- if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
+ if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
return false;
posted = false;
spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&ctx->cq_overflow_list)) {
- struct io_uring_cqe *cqe = io_get_cqring(ctx);
+ struct io_uring_cqe *cqe = io_get_cqe(ctx);
struct io_overflow_cqe *ocqe;
if (!cqe && !force)
@@ -1446,8 +1463,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (cqe)
memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
else
- WRITE_ONCE(ctx->rings->cq_overflow,
- ++ctx->cached_cq_overflow);
+ io_account_cq_overflow(ctx);
+
posted = true;
list_del(&ocqe->list);
kfree(ocqe);
@@ -1455,8 +1472,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
+ clear_bit(0, &ctx->check_cq_overflow);
ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
@@ -1472,7 +1488,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
bool ret = true;
- if (test_bit(0, &ctx->cq_check_overflow)) {
+ if (test_bit(0, &ctx->check_cq_overflow)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
@@ -1531,12 +1547,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
* or cannot allocate an overflow entry, then we need to drop it
* on the floor.
*/
- WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
+ io_account_cq_overflow(ctx);
return false;
}
if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->sq_check_overflow);
- set_bit(0, &ctx->cq_check_overflow);
+ set_bit(0, &ctx->check_cq_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
ocqe->cqe.user_data = user_data;
@@ -1558,7 +1573,7 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
- cqe = io_get_cqring(ctx);
+ cqe = io_get_cqe(ctx);
if (likely(cqe)) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
@@ -1588,10 +1603,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
* free_list cache.
*/
if (req_ref_put_and_test(req)) {
- struct io_comp_state *cs = &ctx->submit_state.comp;
-
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK))
+ if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL))
io_disarm_next(req);
if (req->link) {
io_req_task_queue(req->link);
@@ -1600,8 +1613,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
}
io_dismantle_req(req);
io_put_task(req->task, 1);
- list_add(&req->compl.list, &cs->locked_free_list);
- cs->locked_free_nr++;
+ list_add(&req->compl.list, &ctx->locked_free_list);
+ ctx->locked_free_nr++;
} else {
if (!percpu_ref_tryget(&ctx->refs))
req = NULL;
@@ -1617,8 +1630,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
static inline bool io_req_needs_clean(struct io_kiocb *req)
{
- return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
- REQ_F_POLLED | REQ_F_INFLIGHT);
+ return req->flags & IO_REQ_CLEAN_FLAGS;
}
static void io_req_complete_state(struct io_kiocb *req, long res,
@@ -1647,7 +1659,7 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
static void io_req_complete_failed(struct io_kiocb *req, long res)
{
- req_set_fail_links(req);
+ req_set_fail(req);
io_put_req(req);
io_req_complete_post(req, res, 0);
}
@@ -1656,8 +1668,8 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_comp_state *cs)
{
spin_lock_irq(&ctx->completion_lock);
- list_splice_init(&cs->locked_free_list, &cs->free_list);
- cs->locked_free_nr = 0;
+ list_splice_init(&ctx->locked_free_list, &cs->free_list);
+ ctx->locked_free_nr = 0;
spin_unlock_irq(&ctx->completion_lock);
}
@@ -1673,7 +1685,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
* locked cache, grab the lock and move them over to our submission
* side cache.
*/
- if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH)
+ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
io_flush_cached_locked_reqs(ctx, cs);
nr = state->free_reqs;
@@ -1695,11 +1707,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
- BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
+ BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
if (!state->free_reqs) {
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
- int ret;
+ int ret, i;
if (io_flush_cached_reqs(ctx))
goto got_req;
@@ -1717,6 +1729,20 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
return NULL;
ret = 1;
}
+
+ /*
+ * Don't initialise the fields below on every allocation, but
+ * do that in advance and keep valid on free.
+ */
+ for (i = 0; i < ret; i++) {
+ struct io_kiocb *req = state->reqs[i];
+
+ req->ctx = ctx;
+ req->link = NULL;
+ req->async_data = NULL;
+ /* not necessary, but safer to zero */
+ req->result = 0;
+ }
state->free_reqs = ret;
}
got_req:
@@ -1740,11 +1766,9 @@ static void io_dismantle_req(struct io_kiocb *req)
io_put_file(req->file);
if (req->fixed_rsrc_refs)
percpu_ref_put(req->fixed_rsrc_refs);
- if (req->async_data)
+ if (req->async_data) {
kfree(req->async_data);
- if (req->work.creds) {
- put_cred(req->work.creds);
- req->work.creds = NULL;
+ req->async_data = NULL;
}
}
@@ -1826,7 +1850,7 @@ static bool io_disarm_next(struct io_kiocb *req)
if (likely(req->flags & REQ_F_LINK_TIMEOUT))
posted = io_kill_linked_timeout(req);
- if (unlikely((req->flags & REQ_F_FAIL_LINK) &&
+ if (unlikely((req->flags & REQ_F_FAIL) &&
!(req->flags & REQ_F_HARDLINK))) {
posted |= (req->link != NULL);
io_fail_links(req);
@@ -1844,7 +1868,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) {
+ if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) {
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
bool posted;
@@ -1875,54 +1899,51 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx)
return;
if (ctx->submit_state.comp.nr) {
mutex_lock(&ctx->uring_lock);
- io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
}
-static bool __tctx_task_work(struct io_uring_task *tctx)
+static void tctx_task_work(struct callback_head *cb)
{
struct io_ring_ctx *ctx = NULL;
- struct io_wq_work_list list;
- struct io_wq_work_node *node;
-
- if (wq_list_empty(&tctx->task_list))
- return false;
-
- spin_lock_irq(&tctx->task_lock);
- list = tctx->task_list;
- INIT_WQ_LIST(&tctx->task_list);
- spin_unlock_irq(&tctx->task_lock);
+ struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
+ task_work);
- node = list.first;
- while (node) {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req;
-
- req = container_of(node, struct io_kiocb, io_task_work.node);
- if (req->ctx != ctx) {
- ctx_flush_and_put(ctx);
- ctx = req->ctx;
- percpu_ref_get(&ctx->refs);
+ while (1) {
+ struct io_wq_work_node *node;
+
+ spin_lock_irq(&tctx->task_lock);
+ node = tctx->task_list.first;
+ INIT_WQ_LIST(&tctx->task_list);
+ spin_unlock_irq(&tctx->task_lock);
+
+ while (node) {
+ struct io_wq_work_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
+
+ if (req->ctx != ctx) {
+ ctx_flush_and_put(ctx);
+ ctx = req->ctx;
+ percpu_ref_get(&ctx->refs);
+ }
+ req->task_work.func(&req->task_work);
+ node = next;
}
-
- req->task_work.func(&req->task_work);
- node = next;
+ if (wq_list_empty(&tctx->task_list)) {
+ clear_bit(0, &tctx->task_state);
+ if (wq_list_empty(&tctx->task_list))
+ break;
+ /* another tctx_task_work() is enqueued, yield */
+ if (test_and_set_bit(0, &tctx->task_state))
+ break;
+ }
+ cond_resched();
}
ctx_flush_and_put(ctx);
- return list.first != NULL;
-}
-
-static void tctx_task_work(struct callback_head *cb)
-{
- struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
-
- clear_bit(0, &tctx->task_state);
-
- while (__tctx_task_work(tctx))
- cond_resched();
}
static int io_req_task_work_add(struct io_kiocb *req)
@@ -2123,26 +2144,26 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
list_add(&req->compl.list, &state->comp.free_list);
}
-static void io_submit_flush_completions(struct io_comp_state *cs,
- struct io_ring_ctx *ctx)
+static void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
+ struct io_comp_state *cs = &ctx->submit_state.comp;
int i, nr = cs->nr;
- struct io_kiocb *req;
struct req_batch rb;
- io_init_req_batch(&rb);
spin_lock_irq(&ctx->completion_lock);
for (i = 0; i < nr; i++) {
- req = cs->reqs[i];
+ struct io_kiocb *req = cs->reqs[i];
+
__io_cqring_fill_event(ctx, req->user_data, req->result,
req->compl.cflags);
}
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
-
io_cqring_ev_posted(ctx);
+
+ io_init_req_batch(&rb);
for (i = 0; i < nr; i++) {
- req = cs->reqs[i];
+ struct io_kiocb *req = cs->reqs[i];
/* submission and completion refs */
if (req_ref_sub_and_test(req, 2))
@@ -2230,12 +2251,6 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
static inline bool io_run_task_work(void)
{
- /*
- * Not safe to run on exiting task, and the task_work handling will
- * not add work to such a task.
- */
- if (unlikely(current->flags & PF_EXITING))
- return false;
if (current->task_works) {
__set_current_state(TASK_RUNNING);
task_work_run();
@@ -2299,7 +2314,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
* Only spin for completions if we don't have multiple devices hanging
* off our complete list, and we're under the requested amount.
*/
- spin = !ctx->poll_multi_file && *nr_events < min;
+ spin = !ctx->poll_multi_queue && *nr_events < min;
ret = 0;
list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
@@ -2384,7 +2399,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (test_bit(0, &ctx->cq_check_overflow))
+ if (test_bit(0, &ctx->check_cq_overflow))
__io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx))
goto out;
@@ -2483,7 +2498,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
req->flags |= REQ_F_REISSUE;
return;
}
- req_set_fail_links(req);
+ req_set_fail(req);
}
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_rw_kbuf(req);
@@ -2506,7 +2521,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (unlikely(res != req->result)) {
if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
io_resubmit_prep(req))) {
- req_set_fail_links(req);
+ req_set_fail(req);
req->flags |= REQ_F_DONT_REISSUE;
}
}
@@ -2523,9 +2538,14 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
-static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
+static void io_iopoll_req_issued(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ const bool in_async = io_wq_current_is_worker();
+
+ /* workqueue context doesn't hold uring_lock, grab it now */
+ if (unlikely(in_async))
+ mutex_lock(&ctx->uring_lock);
/*
* Track whether we have multiple files in our lists. This will impact
@@ -2533,14 +2553,22 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
* different devices.
*/
if (list_empty(&ctx->iopoll_list)) {
- ctx->poll_multi_file = false;
- } else if (!ctx->poll_multi_file) {
+ ctx->poll_multi_queue = false;
+ } else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
+ unsigned int queue_num0, queue_num1;
list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
inflight_entry);
- if (list_req->file != req->file)
- ctx->poll_multi_file = true;
+
+ if (list_req->file != req->file) {
+ ctx->poll_multi_queue = true;
+ } else {
+ queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
+ queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
+ if (queue_num0 != queue_num1)
+ ctx->poll_multi_queue = true;
+ }
}
/*
@@ -2552,14 +2580,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
else
list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
- /*
- * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
- * task context or in io worker task context. If current task context is
- * sq thread, we don't need to check whether should wake up sq thread.
- */
- if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
- wq_has_sleeper(&ctx->sq_data->wait))
- wake_up(&ctx->sq_data->wait);
+ if (unlikely(in_async)) {
+ /*
+ * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
+ * in sq thread task context or in io worker task context. If
+ * current task context is sq thread, we don't need to check
+ * whether should wake up sq thread.
+ */
+ if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ wq_has_sleeper(&ctx->sq_data->wait))
+ wake_up(&ctx->sq_data->wait);
+
+ mutex_unlock(&ctx->uring_lock);
+ }
}
static inline void io_state_file_put(struct io_submit_state *state)
@@ -2616,7 +2649,7 @@ static bool __io_file_supports_async(struct file *file, int rw)
return true;
return false;
}
- if (S_ISCHR(mode) || S_ISSOCK(mode))
+ if (S_ISSOCK(mode))
return true;
if (S_ISREG(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
@@ -2749,12 +2782,12 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
- if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
+ if (ret >= 0 && check_reissue)
__io_complete_rw(req, ret, 0, issue_flags);
else
io_rw_done(kiocb, ret);
- if (check_reissue && req->flags & REQ_F_REISSUE) {
+ if (check_reissue && (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
req_ref_get(req);
@@ -2762,7 +2795,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
} else {
int cflags = 0;
- req_set_fail_links(req);
+ req_set_fail(req);
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_rw_kbuf(req);
__io_req_complete(req, issue_flags, ret, cflags);
@@ -3233,7 +3266,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
return true;
}
-static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter);
@@ -3448,6 +3481,10 @@ static int io_renameat_prep(struct io_kiocb *req,
struct io_rename *ren = &req->rename;
const char __user *oldf, *newf;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->buf_index)
+ return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -3484,7 +3521,7 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -3495,6 +3532,10 @@ static int io_unlinkat_prep(struct io_kiocb *req,
struct io_unlink *un = &req->unlink;
const char __user *fname;
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -3528,7 +3569,7 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -3565,7 +3606,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_shutdown_sock(sock, req->shutdown.how);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -3576,7 +3617,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
static int __io_splice_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- struct io_splice* sp = &req->splice;
+ struct io_splice *sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3623,14 +3664,14 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret != sp->len)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_splice* sp = &req->splice;
+ struct io_splice *sp = &req->splice;
sp->off_in = READ_ONCE(sqe->splice_off_in);
sp->off_out = READ_ONCE(sqe->off);
@@ -3660,7 +3701,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret != sp->len)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -3713,7 +3754,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -3742,7 +3783,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -3836,32 +3877,31 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
goto err;
file = do_filp_open(req->open.dfd, req->open.filename, &op);
- /* only retry if RESOLVE_CACHED wasn't already set by application */
- if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
- file == ERR_PTR(-EAGAIN)) {
+ if (IS_ERR(file)) {
/*
- * We could hang on to this 'fd', but seems like marginal
- * gain for something that is now known to be a slower path.
- * So just put it, and we'll get a new one when we retry.
+ * We could hang on to this 'fd' on retrying, but seems like
+ * marginal gain for something that is now known to be a slower
+ * path. So just put it, and we'll get a new one when we retry.
*/
put_unused_fd(ret);
- return -EAGAIN;
- }
- if (IS_ERR(file)) {
- put_unused_fd(ret);
ret = PTR_ERR(file);
- } else {
- if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
- file->f_flags &= ~O_NONBLOCK;
- fsnotify_open(file);
- fd_install(ret, file);
+ /* only retry if RESOLVE_CACHED wasn't already set by application */
+ if (ret == -EAGAIN &&
+ (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
+ return -EAGAIN;
+ goto err;
}
+
+ if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
+ file->f_flags &= ~O_NONBLOCK;
+ fsnotify_open(file);
+ fd_install(ret, file);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -3933,7 +3973,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (head)
ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
@@ -4024,7 +4064,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
__io_remove_buffers(ctx, head, p->bgid, -1U);
}
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
__io_req_complete(req, issue_flags, ret, 0);
io_ring_submit_unlock(ctx, !force_nonblock);
@@ -4070,7 +4110,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
#else
@@ -4106,7 +4146,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -4145,7 +4185,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4180,7 +4220,7 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
ctx->buffer);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4238,7 +4278,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
ret = filp_close(file, current->files);
err:
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
if (file)
fput(file);
__io_req_complete(req, issue_flags, ret, 0);
@@ -4271,7 +4311,7 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4375,7 +4415,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < min_ret)
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4417,7 +4457,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
ret = -EINTR;
if (ret < min_ret)
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -4612,7 +4652,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
@@ -4667,7 +4707,7 @@ out_free:
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_recv_kbuf(req);
if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, cflags);
return 0;
}
@@ -4706,7 +4746,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0) {
if (ret == -ERESTARTSYS)
ret = -EINTR;
- req_set_fail_links(req);
+ req_set_fail(req);
}
__io_req_complete(req, issue_flags, ret, 0);
return 0;
@@ -4770,7 +4810,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = -EINTR;
out:
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5019,10 +5059,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
* Can't handle multishot for double wait for now, turn it
* into one-shot mode.
*/
- if (!(req->poll.events & EPOLLONESHOT))
- req->poll.events |= EPOLLONESHOT;
+ if (!(poll_one->events & EPOLLONESHOT))
+ poll_one->events |= EPOLLONESHOT;
/* double add on the same waitqueue head, ignore */
- if (poll->head == head)
+ if (poll_one->head == head)
return;
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
if (!poll) {
@@ -5059,7 +5099,7 @@ static void io_async_task_func(struct callback_head *cb)
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
- trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
+ trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
if (io_poll_rewait(req, &apoll->poll)) {
spin_unlock_irq(&ctx->completion_lock);
@@ -5138,50 +5178,51 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
return mask;
}
-static bool io_arm_poll_handler(struct io_kiocb *req)
+enum {
+ IO_APOLL_OK,
+ IO_APOLL_ABORTED,
+ IO_APOLL_READY
+};
+
+static int io_arm_poll_handler(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t mask, ret;
+ __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
int rw;
if (!req->file || !file_can_poll(req->file))
- return false;
+ return IO_APOLL_ABORTED;
if (req->flags & REQ_F_POLLED)
- return false;
- if (def->pollin)
+ return IO_APOLL_ABORTED;
+ if (!def->pollin && !def->pollout)
+ return IO_APOLL_ABORTED;
+
+ if (def->pollin) {
rw = READ;
- else if (def->pollout)
+ mask |= POLLIN | POLLRDNORM;
+
+ /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
+ if ((req->opcode == IORING_OP_RECVMSG) &&
+ (req->sr_msg.msg_flags & MSG_ERRQUEUE))
+ mask &= ~POLLIN;
+ } else {
rw = WRITE;
- else
- return false;
+ mask |= POLLOUT | POLLWRNORM;
+ }
+
/* if we can't nonblock try, then no point in arming a poll handler */
if (!io_file_supports_async(req, rw))
- return false;
+ return IO_APOLL_ABORTED;
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
- return false;
+ return IO_APOLL_ABORTED;
apoll->double_poll = NULL;
-
- req->flags |= REQ_F_POLLED;
req->apoll = apoll;
-
- mask = EPOLLONESHOT;
- if (def->pollin)
- mask |= POLLIN | POLLRDNORM;
- if (def->pollout)
- mask |= POLLOUT | POLLWRNORM;
-
- /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
- if ((req->opcode == IORING_OP_RECVMSG) &&
- (req->sr_msg.msg_flags & MSG_ERRQUEUE))
- mask &= ~POLLIN;
-
- mask |= POLLERR | POLLPRI;
-
+ req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
@@ -5189,12 +5230,14 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
if (ret || ipt.error) {
io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
- return false;
+ if (ret)
+ return IO_APOLL_READY;
+ return IO_APOLL_ABORTED;
}
spin_unlock_irq(&ctx->completion_lock);
- trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
- apoll->poll.events);
- return true;
+ trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ mask, apoll->poll.events);
+ return IO_APOLL_OK;
}
static bool __io_poll_remove_one(struct io_kiocb *req,
@@ -5241,7 +5284,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
if (do_complete) {
io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
io_commit_cqring(req->ctx);
- req_set_fail_links(req);
+ req_set_fail(req);
io_put_req_deferred(req, 1);
}
@@ -5252,7 +5295,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
* Returns true if we found and killed one or more poll requests
*/
static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- struct files_struct *files)
+ bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -5264,7 +5307,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node) {
- if (io_match_task(req, tsk, files))
+ if (io_match_task(req, tsk, cancel_all))
posted += io_poll_remove_one(req);
}
}
@@ -5451,7 +5494,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
err:
if (ret < 0) {
spin_unlock_irq(&ctx->completion_lock);
- req_set_fail_links(req);
+ req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -5471,7 +5514,7 @@ err:
if (!completing) {
ret = io_poll_add(preq, issue_flags);
if (ret < 0) {
- req_set_fail_links(preq);
+ req_set_fail(preq);
io_req_complete(preq, ret);
}
}
@@ -5496,7 +5539,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
- req_set_fail_links(req);
+ req_set_fail(req);
io_put_req(req);
return HRTIMER_NORESTART;
}
@@ -5532,7 +5575,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (IS_ERR(req))
return PTR_ERR(req);
- req_set_fail_links(req);
+ req_set_fail(req);
io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
io_put_req_deferred(req, 1);
return 0;
@@ -5611,7 +5654,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_put_req(req);
return 0;
}
@@ -5634,6 +5677,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return -EINVAL;
req->timeout.off = off;
+ if (unlikely(off && !req->ctx->off_timeout_used))
+ req->ctx->off_timeout_used = true;
if (!req->async_data && io_alloc_async_data(req))
return -ENOMEM;
@@ -5764,7 +5809,7 @@ done:
io_cqring_ev_posted(ctx);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
}
static int io_async_cancel_prep(struct io_kiocb *req,
@@ -5821,7 +5866,7 @@ done:
io_cqring_ev_posted(ctx);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
io_put_req(req);
return 0;
}
@@ -5863,7 +5908,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
mutex_unlock(&ctx->uring_lock);
if (ret < 0)
- req_set_fail_links(req);
+ req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
return 0;
}
@@ -5974,48 +6019,69 @@ static int io_req_prep_async(struct io_kiocb *req)
static u32 io_get_sequence(struct io_kiocb *req)
{
- struct io_kiocb *pos;
- struct io_ring_ctx *ctx = req->ctx;
- u32 total_submitted, nr_reqs = 0;
+ u32 seq = req->ctx->cached_sq_head;
- io_for_each_link(pos, req)
- nr_reqs++;
-
- total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
- return total_submitted - nr_reqs;
+ /* need original cached_sq_head, but it was increased for each req */
+ io_for_each_link(req, req)
+ seq--;
+ return seq;
}
-static int io_req_defer(struct io_kiocb *req)
+static bool io_drain_req(struct io_kiocb *req)
{
+ struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
u32 seq;
+ /*
+ * If we need to drain a request in the middle of a link, drain the
+ * head request and the next request/link after the current link.
+ * Considering sequential execution of links, IOSQE_IO_DRAIN will be
+ * maintained for every request of our link.
+ */
+ if (ctx->drain_next) {
+ req->flags |= REQ_F_IO_DRAIN;
+ ctx->drain_next = false;
+ }
+ /* not interested in head, start from the first linked */
+ io_for_each_link(pos, req->link) {
+ if (pos->flags & REQ_F_IO_DRAIN) {
+ ctx->drain_next = true;
+ req->flags |= REQ_F_IO_DRAIN;
+ break;
+ }
+ }
+
/* Still need defer if there is pending req in defer list. */
if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN)))
- return 0;
+ !(req->flags & REQ_F_IO_DRAIN))) {
+ ctx->drain_active = false;
+ return false;
+ }
seq = io_get_sequence(req);
/* Still a chance to pass the sequence check */
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return 0;
+ return false;
ret = io_req_prep_async(req);
if (ret)
return ret;
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
- if (!de)
- return -ENOMEM;
+ if (!de) {
+ io_req_complete_failed(req, ret);
+ return true;
+ }
spin_lock_irq(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
kfree(de);
io_queue_async_work(req);
- return -EIOCBQUEUED;
+ return true;
}
trace_io_uring_defer(ctx, req, req->user_data);
@@ -6023,7 +6089,7 @@ static int io_req_defer(struct io_kiocb *req)
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
- return -EIOCBQUEUED;
+ return true;
}
static void io_clean_op(struct io_kiocb *req)
@@ -6040,7 +6106,6 @@ static void io_clean_op(struct io_kiocb *req)
kfree(req->sr_msg.kbuf);
break;
}
- req->flags &= ~REQ_F_BUFFER_SELECTED;
}
if (req->flags & REQ_F_NEED_CLEANUP) {
@@ -6052,8 +6117,8 @@ static void io_clean_op(struct io_kiocb *req)
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE: {
struct io_async_rw *io = req->async_data;
- if (io->free_iovec)
- kfree(io->free_iovec);
+
+ kfree(io->free_iovec);
break;
}
case IORING_OP_RECVMSG:
@@ -6081,7 +6146,6 @@ static void io_clean_op(struct io_kiocb *req)
putname(req->unlink.filename);
break;
}
- req->flags &= ~REQ_F_NEED_CLEANUP;
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
kfree(req->apoll->double_poll);
@@ -6092,8 +6156,11 @@ static void io_clean_op(struct io_kiocb *req)
struct io_uring_task *tctx = req->task->io_uring;
atomic_dec(&tctx->inflight_tracked);
- req->flags &= ~REQ_F_INFLIGHT;
}
+ if (req->flags & REQ_F_CREDS)
+ put_cred(req->creds);
+
+ req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
@@ -6102,8 +6169,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
const struct cred *creds = NULL;
int ret;
- if (req->work.creds && req->work.creds != current_cred())
- creds = override_creds(req->work.creds);
+ if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ creds = override_creds(req->creds);
switch (req->opcode) {
case IORING_OP_NOP:
@@ -6213,23 +6280,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (creds)
revert_creds(creds);
-
if (ret)
return ret;
-
/* If the op doesn't have a file, we're not polling for it */
- if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
- const bool in_async = io_wq_current_is_worker();
-
- /* workqueue context doesn't hold uring_lock, grab it now */
- if (in_async)
- mutex_lock(&ctx->uring_lock);
-
- io_iopoll_req_issued(req, in_async);
-
- if (in_async)
- mutex_unlock(&ctx->uring_lock);
- }
+ if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
+ io_iopoll_req_issued(req);
return 0;
}
@@ -6411,6 +6466,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
int ret;
+issue_sqe:
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
/*
@@ -6425,17 +6481,21 @@ static void __io_queue_sqe(struct io_kiocb *req)
cs->reqs[cs->nr++] = req;
if (cs->nr == ARRAY_SIZE(cs->reqs))
- io_submit_flush_completions(cs, ctx);
+ io_submit_flush_completions(ctx);
} else {
io_put_req(req);
}
} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
- if (!io_arm_poll_handler(req)) {
+ switch (io_arm_poll_handler(req)) {
+ case IO_APOLL_READY:
+ goto issue_sqe;
+ case IO_APOLL_ABORTED:
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
io_queue_async_work(req);
+ break;
}
} else {
io_req_complete_failed(req, ret);
@@ -6444,23 +6504,20 @@ static void __io_queue_sqe(struct io_kiocb *req)
io_queue_linked_timeout(linked_timeout);
}
-static void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req)
{
- int ret;
+ if (unlikely(req->ctx->drain_active) && io_drain_req(req))
+ return;
- ret = io_req_defer(req);
- if (ret) {
- if (ret != -EIOCBQUEUED) {
-fail_req:
- io_req_complete_failed(req, ret);
- }
- } else if (req->flags & REQ_F_FORCE_ASYNC) {
- ret = io_req_prep_async(req);
- if (unlikely(ret))
- goto fail_req;
- io_queue_async_work(req);
- } else {
+ if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) {
__io_queue_sqe(req);
+ } else {
+ int ret = io_req_prep_async(req);
+
+ if (unlikely(ret))
+ io_req_complete_failed(req, ret);
+ else
+ io_queue_async_work(req);
}
}
@@ -6473,7 +6530,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
- if (!ctx->restricted)
+ if (likely(!ctx->restricted))
return true;
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
@@ -6501,35 +6558,33 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->user_data = READ_ONCE(sqe->user_data);
- req->async_data = NULL;
req->file = NULL;
- req->ctx = ctx;
- req->link = NULL;
req->fixed_rsrc_refs = NULL;
/* one is dropped after submission, the other at completion */
atomic_set(&req->refs, 2);
req->task = current;
- req->result = 0;
- req->work.creds = NULL;
/* enforce forwards compatibility on users */
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
return -EINVAL;
if (unlikely(req->opcode >= IORING_OP_LAST))
return -EINVAL;
- if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
+ if (!io_check_restriction(ctx, req, sqe_flags))
return -EACCES;
if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;
+ if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
+ ctx->drain_active = true;
personality = READ_ONCE(sqe->personality);
if (personality) {
- req->work.creds = xa_load(&ctx->personalities, personality);
- if (!req->work.creds)
+ req->creds = xa_load(&ctx->personalities, personality);
+ if (!req->creds)
return -EINVAL;
- get_cred(req->work.creds);
+ get_cred(req->creds);
+ req->flags |= REQ_F_CREDS;
}
state = &ctx->submit_state;
@@ -6566,20 +6621,22 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
fail_req:
if (link->head) {
/* fail even hard links since we don't submit */
- link->head->flags |= REQ_F_FAIL_LINK;
+ req_set_fail(link->head);
io_req_complete_failed(link->head, -ECANCELED);
link->head = NULL;
}
io_req_complete_failed(req, ret);
return ret;
}
+
ret = io_req_prep(req, sqe);
if (unlikely(ret))
goto fail_req;
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, ctx->flags & IORING_SETUP_SQPOLL);
+ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ req->flags, true,
+ ctx->flags & IORING_SETUP_SQPOLL);
/*
* If we already have a head request, queue this one for async
@@ -6591,17 +6648,6 @@ fail_req:
if (link->head) {
struct io_kiocb *head = link->head;
- /*
- * Taking sequential execution of a link, draining both sides
- * of the link also fullfils IOSQE_IO_DRAIN semantics for all
- * requests in the link. So, it drains the head and the
- * next after the link request. The last one is done via
- * drain_next flag to persist the effect across calls.
- */
- if (req->flags & REQ_F_IO_DRAIN) {
- head->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = 1;
- }
ret = io_req_prep_async(req);
if (unlikely(ret))
goto fail_req;
@@ -6611,14 +6657,10 @@ fail_req:
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_sqe(head);
link->head = NULL;
+ io_queue_sqe(head);
}
} else {
- if (unlikely(ctx->drain_next)) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = 0;
- }
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
link->head = req;
link->last = req;
@@ -6639,7 +6681,7 @@ static void io_submit_state_end(struct io_submit_state *state,
if (state->link.head)
io_queue_sqe(state->link.head);
if (state->comp.nr)
- io_submit_flush_completions(&state->comp, ctx);
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
io_state_file_put(state);
@@ -6670,7 +6712,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
}
/*
- * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
+ * Fetch an sqe, if one is available. Note this returns a pointer to memory
* that is mapped by userspace. This means that care needs to be taken to
* ensure that reads are stable, as we cannot rely on userspace always
* being a good citizen. If members of the sqe are validated and then later
@@ -6679,8 +6721,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
*/
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
- u32 *sq_array = ctx->sq_array;
- unsigned head;
+ unsigned head, mask = ctx->sq_entries - 1;
+ unsigned sq_idx = ctx->cached_sq_head++ & mask;
/*
* The cached sq head (or cq tail) serves two purposes:
@@ -6690,28 +6732,36 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
- head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
+ head = READ_ONCE(ctx->sq_array[sq_idx]);
if (likely(head < ctx->sq_entries))
return &ctx->sq_sqes[head];
/* drop invalid entries */
- ctx->cached_sq_dropped++;
- WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
+ ctx->cq_extra--;
+ WRITE_ONCE(ctx->rings->sq_dropped,
+ READ_ONCE(ctx->rings->sq_dropped) + 1);
return NULL;
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
+ struct io_uring_task *tctx;
int submitted = 0;
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
-
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- percpu_counter_add(&current->io_uring->inflight, nr);
- refcount_add(nr, &current->usage);
+ tctx = current->io_uring;
+ tctx->cached_refs -= nr;
+ if (unlikely(tctx->cached_refs < 0)) {
+ unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
+
+ percpu_counter_add(&tctx->inflight, refill);
+ refcount_add(refill, &current->usage);
+ tctx->cached_refs += refill;
+ }
io_submit_state_start(&ctx->submit_state, nr);
while (submitted < nr) {
@@ -6737,12 +6787,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
- struct io_uring_task *tctx = current->io_uring;
int unused = nr - ref_used;
+ current->io_uring->cached_refs += unused;
percpu_ref_put_many(&ctx->refs, unused);
- percpu_counter_sub(&tctx->inflight, unused);
- put_task_struct_many(current, unused);
}
io_submit_state_end(&ctx->submit_state, ctx);
@@ -6752,6 +6800,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
return submitted;
}
+static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
+{
+ return READ_ONCE(sqd->state);
+}
+
static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
{
/* Tell userspace we may need a wakeup call */
@@ -6774,11 +6827,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
to_submit = io_sqring_entries(ctx);
/* if we're handling multiple rings, cap submit size for fairness */
- if (cap_entries && to_submit > 8)
- to_submit = 8;
+ if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
+ to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
if (!list_empty(&ctx->iopoll_list) || to_submit) {
unsigned nr_events = 0;
+ const struct cred *creds = NULL;
+
+ if (ctx->sq_creds != current_cred())
+ creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
if (!list_empty(&ctx->iopoll_list))
@@ -6792,10 +6849,12 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
- }
- if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
- wake_up(&ctx->sqo_sq_wait);
+ if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
+ wake_up(&ctx->sqo_sq_wait);
+ if (creds)
+ revert_creds(creds);
+ }
return ret;
}
@@ -6810,6 +6869,22 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
sqd->sq_thread_idle = sq_thread_idle;
}
+static bool io_sqd_handle_event(struct io_sq_data *sqd)
+{
+ bool did_sig = false;
+ struct ksignal ksig;
+
+ if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
+ signal_pending(current)) {
+ mutex_unlock(&sqd->lock);
+ if (signal_pending(current))
+ did_sig = get_signal(&ksig);
+ cond_resched();
+ mutex_lock(&sqd->lock);
+ }
+ return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+}
+
static int io_sq_thread(void *data)
{
struct io_sq_data *sqd = data;
@@ -6828,48 +6903,26 @@ static int io_sq_thread(void *data)
current->flags |= PF_NO_SETAFFINITY;
mutex_lock(&sqd->lock);
- /* a user may had exited before the thread started */
- io_run_task_work_head(&sqd->park_task_work);
-
- while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
- int ret;
- bool cap_entries, sqt_spin, needs_sched;
-
- if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
- signal_pending(current)) {
- bool did_sig = false;
-
- mutex_unlock(&sqd->lock);
- if (signal_pending(current)) {
- struct ksignal ksig;
+ while (1) {
+ bool cap_entries, sqt_spin = false;
- did_sig = get_signal(&ksig);
- }
- cond_resched();
- mutex_lock(&sqd->lock);
- io_run_task_work();
- io_run_task_work_head(&sqd->park_task_work);
- if (did_sig)
+ if (io_sqd_events_pending(sqd) || signal_pending(current)) {
+ if (io_sqd_handle_event(sqd))
break;
timeout = jiffies + sqd->sq_thread_idle;
- continue;
}
- sqt_spin = false;
+
cap_entries = !list_is_singular(&sqd->ctx_list);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- const struct cred *creds = NULL;
+ int ret = __io_sq_thread(ctx, cap_entries);
- if (ctx->sq_creds != current_cred())
- creds = override_creds(ctx->sq_creds);
- ret = __io_sq_thread(ctx, cap_entries);
- if (creds)
- revert_creds(creds);
if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
+ if (io_run_task_work())
+ sqt_spin = true;
if (sqt_spin || !time_after(jiffies, timeout)) {
- io_run_task_work();
cond_resched();
if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
@@ -6877,12 +6930,12 @@ static int io_sq_thread(void *data)
}
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
- if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_set_wakeup_flag(ctx);
+ if (!io_sqd_events_pending(sqd) && !current->task_works) {
+ bool needs_sched = true;
- needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+ io_ring_set_wakeup_flag(ctx);
+
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
!list_empty_careful(&ctx->iopoll_list)) {
needs_sched = false;
@@ -6904,16 +6957,14 @@ static int io_sq_thread(void *data)
}
finish_wait(&sqd->wait, &wait);
- io_run_task_work_head(&sqd->park_task_work);
timeout = jiffies + sqd->sq_thread_idle;
}
- io_uring_cancel_sqpoll(sqd);
+ io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
io_run_task_work();
- io_run_task_work_head(&sqd->park_task_work);
mutex_unlock(&sqd->lock);
complete(&sqd->exited);
@@ -6950,7 +7001,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
* Cannot safely flush overflowed CQEs from here, ensure we wake up
* the task, and the next invocation will do it.
*/
- if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
+ if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
return autoremove_wake_function(curr, mode, wake_flags, key);
return -1;
}
@@ -6978,7 +7029,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (ret || io_should_wake(iowq))
return ret;
/* let the caller flush overflows, retry */
- if (test_bit(0, &ctx->cq_check_overflow))
+ if (test_bit(0, &ctx->check_cq_overflow))
return 1;
*timeout = schedule_timeout(*timeout);
@@ -7043,10 +7094,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
ret = -EBUSY;
break;
}
- prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+ prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
- finish_wait(&ctx->wait, &iowq.wq);
+ finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -7055,14 +7106,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+static void io_free_page_table(void **table, size_t size)
{
- unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+ unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
for (i = 0; i < nr_tables; i++)
- kfree(table->files[i]);
- kfree(table->files);
- table->files = NULL;
+ kfree(table[i]);
+ kfree(table);
+}
+
+static void **io_alloc_page_table(size_t size)
+{
+ unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+ size_t init_size = size;
+ void **table;
+
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ for (i = 0; i < nr_tables; i++) {
+ unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
+
+ table[i] = kzalloc(this_size, GFP_KERNEL);
+ if (!table[i]) {
+ io_free_page_table(table, init_size);
+ return NULL;
+ }
+ size -= this_size;
+ }
+ return table;
}
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
@@ -7151,33 +7224,77 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
return ret;
}
+static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+{
+ unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
+ unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
+
+ return &data->tags[table_idx][off];
+}
+
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
- kvfree(data->tags);
+ size_t size = data->nr * sizeof(data->tags[0][0]);
+
+ if (data->tags)
+ io_free_page_table((void **)data->tags, size);
kfree(data);
}
-static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
- rsrc_put_fn *do_put,
- unsigned nr)
+static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
+ int ret = -ENOMEM;
+ unsigned i;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
- return NULL;
-
- data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+ return -ENOMEM;
+ data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
if (!data->tags) {
kfree(data);
- return NULL;
+ return -ENOMEM;
}
- atomic_set(&data->refs, 1);
+ data->nr = nr;
data->ctx = ctx;
data->do_put = do_put;
+ if (utags) {
+ ret = -EFAULT;
+ for (i = 0; i < nr; i++) {
+ u64 *tag_slot = io_get_tag_slot(data, i);
+
+ if (copy_from_user(tag_slot, &utags[i],
+ sizeof(*tag_slot)))
+ goto fail;
+ }
+ }
+
+ atomic_set(&data->refs, 1);
init_completion(&data->done);
- return data;
+ *pdata = data;
+ return 0;
+fail:
+ io_rsrc_data_free(data);
+ return ret;
+}
+
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+ size_t size = nr_files * sizeof(struct io_fixed_file);
+
+ table->files = (struct io_fixed_file **)io_alloc_page_table(size);
+ return !!table->files;
+}
+
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+ size_t size = nr_files * sizeof(struct io_fixed_file);
+
+ io_free_page_table((void **)table->files, size);
+ table->files = NULL;
}
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
@@ -7441,31 +7558,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
-static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
-
- table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
- if (!table->files)
- return false;
-
- for (i = 0; i < nr_tables; i++) {
- unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
-
- table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
- GFP_KERNEL);
- if (!table->files[i])
- break;
- nr_files -= this_files;
- }
-
- if (i == nr_tables)
- return true;
-
- io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
- return false;
-}
-
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
@@ -7540,14 +7632,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
if (prsrc->tag) {
bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
- unsigned long flags;
io_ring_submit_lock(ctx, lock_ring);
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock_irq(&ctx->completion_lock);
io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
ctx->cq_extra++;
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_ring_submit_unlock(ctx, lock_ring);
}
@@ -7629,7 +7720,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
struct file *file;
int fd, ret;
unsigned i;
- struct io_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
@@ -7640,27 +7730,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
+ ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
+ &ctx->file_data);
+ if (ret)
+ return ret;
- file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
- if (!file_data)
- return -ENOMEM;
- ctx->file_data = file_data;
ret = -ENOMEM;
if (!io_alloc_file_tables(&ctx->file_table, nr_args))
goto out_free;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- u64 tag = 0;
-
- if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
- copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto out_fput;
}
/* allow sparse sets */
if (fd == -1) {
ret = -EINVAL;
- if (unlikely(tag))
+ if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
goto out_fput;
continue;
}
@@ -7681,7 +7768,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(file);
goto out_fput;
}
- ctx->file_data->tags[i] = tag;
io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
}
@@ -7759,7 +7845,7 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
if (!prsrc)
return -ENOMEM;
- prsrc->tag = data->tags[idx];
+ prsrc->tag = *io_get_tag_slot(data, idx);
prsrc->rsrc = rsrc;
list_add(&prsrc->list, &node->rsrc_list);
return 0;
@@ -7829,7 +7915,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- data->tags[up->offset + done] = tag;
+ *io_get_tag_slot(data, up->offset + done) = tag;
io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
@@ -7887,7 +7973,7 @@ static int io_uring_alloc_task_context(struct task_struct *task,
struct io_uring_task *tctx;
int ret;
- tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+ tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
if (unlikely(!tctx))
return -ENOMEM;
@@ -7907,13 +7993,11 @@ static int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
- tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- tctx->task_state = 0;
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@@ -7924,6 +8008,7 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(!xa_empty(&tctx->xa));
WARN_ON_ONCE(tctx->io_wq);
+ WARN_ON_ONCE(tctx->cached_refs);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
@@ -8228,6 +8313,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
{
int i, ret;
+ imu->acct_pages = 0;
for (i = 0; i < nr_pages; i++) {
if (!PageCompound(pages[i])) {
imu->acct_pages++;
@@ -8299,6 +8385,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
for (i = 0; i < nr_pages; i++) {
struct vm_area_struct *vma = vmas[i];
+ if (vma_is_shmem(vma))
+ continue;
if (vma->vm_file &&
!is_file_hugepages(vma->vm_file)) {
ret = -EOPNOTSUPP;
@@ -8396,9 +8484,9 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
- data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
- if (!data)
- return -ENOMEM;
+ ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
+ if (ret)
+ return ret;
ret = io_buffers_map_alloc(ctx, nr_args);
if (ret) {
io_rsrc_data_free(data);
@@ -8406,19 +8494,13 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- u64 tag = 0;
-
- if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
- ret = -EFAULT;
- break;
- }
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
break;
ret = io_buffer_validate(&iov);
if (ret)
break;
- if (!iov.iov_base && tag) {
+ if (!iov.iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
}
@@ -8427,7 +8509,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
&last_hpage);
if (ret)
break;
- data->tags[i] = tag;
}
WARN_ON_ONCE(ctx->buf_data);
@@ -8492,7 +8573,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
}
ctx->user_bufs[i] = imu;
- ctx->buf_data->tags[offset] = tag;
+ *io_get_tag_slot(ctx->buf_data, offset) = tag;
}
if (needs_switch)
@@ -8514,6 +8595,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ctx->cq_ev_fd)) {
int ret = PTR_ERR(ctx->cq_ev_fd);
+
ctx->cq_ev_fd = NULL;
return ret;
}
@@ -8637,7 +8719,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->cq_wait, wait);
+ poll_wait(file, &ctx->poll_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
@@ -8659,7 +8741,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
* Users may get EPOLLIN meanwhile seeing nothing in cqring, this
* pushs them to do the flush.
*/
- if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
+ if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
@@ -8707,7 +8789,7 @@ static void io_tctx_exit_cb(struct callback_head *cb)
* node. It'll be removed by the end of cancellation, just ignore it.
*/
if (!atomic_read(&tctx->in_idle))
- io_uring_del_task_file((unsigned long)work->ctx);
+ io_uring_del_tctx_node((unsigned long)work->ctx);
complete(&work->completion);
}
@@ -8733,7 +8815,7 @@ static void io_ring_exit_work(struct work_struct *work)
* as nobody else will be looking for them.
*/
do {
- io_uring_try_cancel_requests(ctx, NULL, NULL);
+ io_uring_try_cancel_requests(ctx, NULL, true);
if (ctx->sq_data) {
struct io_sq_data *sqd = ctx->sq_data;
struct task_struct *tsk;
@@ -8784,14 +8866,14 @@ static void io_ring_exit_work(struct work_struct *work)
/* Returns true if we found and killed one or more timeouts */
static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
- struct files_struct *files)
+ bool cancel_all)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
- if (io_match_task(req, tsk, files)) {
+ if (io_match_task(req, tsk, cancel_all)) {
io_kill_timeout(req, -ECANCELED);
canceled++;
}
@@ -8817,8 +8899,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_unregister_personality(ctx, index);
mutex_unlock(&ctx->uring_lock);
- io_kill_timeouts(ctx, NULL, NULL);
- io_poll_remove_all(ctx, NULL, NULL);
+ io_kill_timeouts(ctx, NULL, true);
+ io_poll_remove_all(ctx, NULL, true);
/* if we failed setting up the ctx, we might not have any rings */
io_iopoll_try_reap_events(ctx);
@@ -8844,7 +8926,7 @@ static int io_uring_release(struct inode *inode, struct file *file)
struct io_task_cancel {
struct task_struct *task;
- struct files_struct *files;
+ bool all;
};
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
@@ -8853,30 +8935,29 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
struct io_task_cancel *cancel = data;
bool ret;
- if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
+ if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
unsigned long flags;
struct io_ring_ctx *ctx = req->ctx;
/* protect against races with linked timeouts */
spin_lock_irqsave(&ctx->completion_lock, flags);
- ret = io_match_task(req, cancel->task, cancel->files);
+ ret = io_match_task(req, cancel->task, cancel->all);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
} else {
- ret = io_match_task(req, cancel->task, cancel->files);
+ ret = io_match_task(req, cancel->task, cancel->all);
}
return ret;
}
static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
+ struct task_struct *task, bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_task(de->req, task, files)) {
+ if (io_match_task(de->req, task, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -8920,9 +9001,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
- struct files_struct *files)
+ bool cancel_all)
{
- struct io_task_cancel cancel = { .task = task, .files = files, };
+ struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
while (1) {
@@ -8942,7 +9023,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
}
/* SQPOLL thread does its own polling */
- if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) ||
+ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
while (!list_empty_careful(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
@@ -8950,10 +9031,11 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
}
}
- ret |= io_cancel_defer_files(ctx, task, files);
- ret |= io_poll_remove_all(ctx, task, files);
- ret |= io_kill_timeouts(ctx, task, files);
- ret |= io_run_task_work();
+ ret |= io_cancel_defer_files(ctx, task, cancel_all);
+ ret |= io_poll_remove_all(ctx, task, cancel_all);
+ ret |= io_kill_timeouts(ctx, task, cancel_all);
+ if (task)
+ ret |= io_run_task_work();
ret |= io_run_ctx_fallback(ctx);
if (!ret)
break;
@@ -8961,7 +9043,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
}
}
-static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
+static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -8998,19 +9080,19 @@ static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
-static inline int io_uring_add_task_file(struct io_ring_ctx *ctx)
+static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
if (likely(tctx && tctx->last == ctx))
return 0;
- return __io_uring_add_task_file(ctx);
+ return __io_uring_add_tctx_node(ctx);
}
/*
* Remove this io_uring_file -> task mapping.
*/
-static void io_uring_del_task_file(unsigned long index)
+static void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
@@ -9035,14 +9117,19 @@ static void io_uring_del_task_file(unsigned long index)
static void io_uring_clean_tctx(struct io_uring_task *tctx)
{
+ struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
unsigned long index;
xa_for_each(&tctx->xa, index, node)
- io_uring_del_task_file(index);
- if (tctx->io_wq) {
- io_wq_put_and_exit(tctx->io_wq);
+ io_uring_del_tctx_node(index);
+ if (wq) {
+ /*
+ * Must be after io_uring_del_tctx_node() (removes nodes under
+ * uring_lock) to avoid a race with io_uring_try_cancel_iowq().
+ */
tctx->io_wq = NULL;
+ io_wq_put_and_exit(wq);
}
}
@@ -9053,93 +9140,83 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_try_cancel(struct files_struct *files)
+static void io_uring_drop_tctx_refs(struct task_struct *task)
{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, node) {
- struct io_ring_ctx *ctx = node->ctx;
+ struct io_uring_task *tctx = task->io_uring;
+ unsigned int refs = tctx->cached_refs;
- /* sqpoll task will cancel all its requests */
- if (!ctx->sq_data)
- io_uring_try_cancel_requests(ctx, current, files);
- }
+ tctx->cached_refs = 0;
+ percpu_counter_sub(&tctx->inflight, refs);
+ put_task_struct_many(task, refs);
}
-/* should only be called by SQPOLL task */
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
+/*
+ * Find any io_uring ctx that this task has registered or done IO on, and cancel
+ * requests. @sqd should be non-NULL iff it's an SQPOLL thread cancellation.
+ */
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
s64 inflight;
DEFINE_WAIT(wait);
+ WARN_ON_ONCE(sqd && sqd->thread != current);
+
if (!current->io_uring)
return;
- WARN_ON_ONCE(!sqd || sqd->thread != current);
+ if (tctx->io_wq)
+ io_wq_exit_start(tctx->io_wq);
+ io_uring_drop_tctx_refs(current);
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = tctx_inflight(tctx, false);
+ inflight = tctx_inflight(tctx, !cancel_all);
if (!inflight)
break;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_uring_try_cancel_requests(ctx, current, NULL);
- prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
- /*
- * If we've seen completions, retry without waiting. This
- * avoids a race where a completion comes in before we did
- * prepare_to_wait().
- */
- if (inflight == tctx_inflight(tctx, false))
- schedule();
- finish_wait(&tctx->wait, &wait);
- } while (1);
- atomic_dec(&tctx->in_idle);
-}
+ if (!sqd) {
+ struct io_tctx_node *node;
+ unsigned long index;
-/*
- * Find any io_uring fd that this task has registered or done IO on, and cancel
- * requests.
- */
-void __io_uring_cancel(struct files_struct *files)
-{
- struct io_uring_task *tctx = current->io_uring;
- DEFINE_WAIT(wait);
- s64 inflight;
+ xa_for_each(&tctx->xa, index, node) {
+ /* sqpoll task will cancel all its requests */
+ if (node->ctx->sq_data)
+ continue;
+ io_uring_try_cancel_requests(node->ctx, current,
+ cancel_all);
+ }
+ } else {
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_try_cancel_requests(ctx, current,
+ cancel_all);
+ }
- /* make sure overflow events are dropped */
- atomic_inc(&tctx->in_idle);
- do {
- /* read completions before cancelations */
- inflight = tctx_inflight(tctx, !!files);
- if (!inflight)
- break;
- io_uring_try_cancel(files);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
/*
* If we've seen completions, retry without waiting. This
* avoids a race where a completion comes in before we did
* prepare_to_wait().
*/
- if (inflight == tctx_inflight(tctx, !!files))
+ if (inflight == tctx_inflight(tctx, !cancel_all))
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
atomic_dec(&tctx->in_idle);
io_uring_clean_tctx(tctx);
- if (!files) {
+ if (cancel_all) {
/* for exec all current's requests should be gone, kill tctx */
__io_uring_free(current);
}
}
+void __io_uring_cancel(struct files_struct *files)
+{
+ io_uring_cancel_generic(!files, NULL);
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
@@ -9300,9 +9377,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_cqring_overflow_flush(ctx, false);
ret = -EOWNERDEAD;
- if (unlikely(ctx->sq_data->thread == NULL)) {
+ if (unlikely(ctx->sq_data->thread == NULL))
goto out;
- }
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT) {
@@ -9312,7 +9388,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
}
submitted = to_submit;
} else if (to_submit) {
- ret = io_uring_add_task_file(ctx);
+ ret = io_uring_add_tctx_node(ctx);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
@@ -9496,8 +9572,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->cq_ring_mask = p->cq_entries - 1;
rings->sq_ring_entries = p->sq_entries;
rings->cq_ring_entries = p->cq_entries;
- ctx->sq_mask = rings->sq_ring_mask;
- ctx->cq_mask = rings->cq_ring_mask;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
@@ -9524,7 +9598,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
if (fd < 0)
return fd;
- ret = io_uring_add_task_file(ctx);
+ ret = io_uring_add_tctx_node(ctx);
if (ret) {
put_unused_fd(fd);
return ret;
@@ -9659,7 +9733,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
- IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
+ IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
+ IORING_FEAT_RSRC_TAGS;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -9899,7 +9974,7 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
}
static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
- unsigned size)
+ unsigned size, unsigned type)
{
struct io_uring_rsrc_update2 up;
@@ -9907,13 +9982,13 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
- if (!up.nr)
+ if (!up.nr || up.resv)
return -EINVAL;
- return __io_register_rsrc_update(ctx, up.type, &up, up.nr);
+ return __io_register_rsrc_update(ctx, type, &up, up.nr);
}
static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int size)
+ unsigned int size, unsigned int type)
{
struct io_uring_rsrc_register rr;
@@ -9924,10 +9999,10 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
memset(&rr, 0, sizeof(rr));
if (copy_from_user(&rr, arg, size))
return -EFAULT;
- if (!rr.nr)
+ if (!rr.nr || rr.resv || rr.resv2)
return -EINVAL;
- switch (rr.type) {
+ switch (type) {
case IORING_RSRC_FILE:
return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
@@ -9938,6 +10013,43 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
return -EINVAL;
}
+static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned len)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ cpumask_var_t new_mask;
+ int ret;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_clear(new_mask);
+ if (len > cpumask_size())
+ len = cpumask_size();
+
+ if (copy_from_user(new_mask, arg, len)) {
+ free_cpumask_var(new_mask);
+ return -EFAULT;
+ }
+
+ ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
+ free_cpumask_var(new_mask);
+ return ret;
+}
+
+static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+
+ return io_wq_cpu_affinity(tctx->io_wq, NULL);
+}
+
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
@@ -9949,8 +10061,12 @@ static bool io_register_op_must_quiesce(int op)
case IORING_REGISTER_PROBE:
case IORING_REGISTER_PERSONALITY:
case IORING_UNREGISTER_PERSONALITY:
- case IORING_REGISTER_RSRC:
- case IORING_REGISTER_RSRC_UPDATE:
+ case IORING_REGISTER_FILES2:
+ case IORING_REGISTER_FILES_UPDATE2:
+ case IORING_REGISTER_BUFFERS2:
+ case IORING_REGISTER_BUFFERS_UPDATE:
+ case IORING_REGISTER_IOWQ_AFF:
+ case IORING_UNREGISTER_IOWQ_AFF:
return false;
default:
return true;
@@ -10076,11 +10192,31 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_REGISTER_RESTRICTIONS:
ret = io_register_restrictions(ctx, arg, nr_args);
break;
- case IORING_REGISTER_RSRC:
- ret = io_register_rsrc(ctx, arg, nr_args);
+ case IORING_REGISTER_FILES2:
+ ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
+ break;
+ case IORING_REGISTER_FILES_UPDATE2:
+ ret = io_register_rsrc_update(ctx, arg, nr_args,
+ IORING_RSRC_FILE);
break;
- case IORING_REGISTER_RSRC_UPDATE:
- ret = io_register_rsrc_update(ctx, arg, nr_args);
+ case IORING_REGISTER_BUFFERS2:
+ ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
+ break;
+ case IORING_REGISTER_BUFFERS_UPDATE:
+ ret = io_register_rsrc_update(ctx, arg, nr_args,
+ IORING_RSRC_BUFFER);
+ break;
+ case IORING_REGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (!arg || !nr_args)
+ break;
+ ret = io_register_iowq_aff(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_unregister_iowq_aff(ctx);
break;
default:
ret = -EINVAL;
@@ -10160,6 +10296,7 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
BUILD_BUG_SQE_ELEM(32, __u64, user_data);
BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
+ BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
@@ -10172,6 +10309,7 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
+
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT);
return 0;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 9023717c5188..0065781935c7 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -640,31 +640,6 @@ out_no_page:
return status;
}
-int
-iomap_set_page_dirty(struct page *page)
-{
- struct address_space *mapping = page_mapping(page);
- int newly_dirty;
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
-
- /*
- * Lock out page's memcg migration to keep PageDirty
- * synchronized with per-memcg dirty page counters.
- */
- lock_page_memcg(page);
- newly_dirty = !TestSetPageDirty(page);
- if (newly_dirty)
- __set_page_dirty(page, mapping, 0);
- unlock_page_memcg(page);
-
- if (newly_dirty)
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- return newly_dirty;
-}
-EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
-
static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
size_t copied, struct page *page)
{
@@ -684,7 +659,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
if (unlikely(copied < len && !PageUptodate(page)))
return 0;
iomap_set_range_uptodate(page, offset_in_page(pos), len);
- iomap_set_page_dirty(page);
+ __set_page_dirty_nobuffers(page);
return copied;
}
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9e6a7ec78be..eb2f8273e6f1 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -235,8 +235,6 @@ static int do_isofs_readdir(struct inode *inode, struct file *file,
break;
}
ctx->pos += de_len;
-
- continue;
}
if (bh)
brelse(bh);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 63b526d44886..51d1eb2ffeb9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -80,23 +80,15 @@ static inline void __buffer_relink_io(struct journal_head *jh)
}
/*
- * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it and 2 if we also released the
- * whole transaction.
+ * Check whether a checkpoint buffer could be released or not.
*
* Requires j_list_lock
*/
-static int __try_to_free_cp_buf(struct journal_head *jh)
+static inline bool __cp_buffer_busy(struct journal_head *jh)
{
- int ret = 0;
struct buffer_head *bh = jh2bh(jh);
- if (jh->b_transaction == NULL && !buffer_locked(bh) &&
- !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- ret = __jbd2_journal_remove_checkpoint(jh) + 1;
- }
- return ret;
+ return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh));
}
/*
@@ -228,7 +220,6 @@ int jbd2_log_do_checkpoint(journal_t *journal)
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
- result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
@@ -295,8 +286,6 @@ restart:
goto restart;
}
if (!buffer_dirty(bh)) {
- if (unlikely(buffer_write_io_error(bh)) && !result)
- result = -EIO;
BUFFER_TRACE(bh, "remove from checkpoint");
if (__jbd2_journal_remove_checkpoint(jh))
/* The transaction was released; we're done */
@@ -356,8 +345,6 @@ restart2:
spin_lock(&journal->j_list_lock);
goto restart2;
}
- if (unlikely(buffer_write_io_error(bh)) && !result)
- result = -EIO;
/*
* Now in whatever state the buffer currently is, we
@@ -369,10 +356,7 @@ restart2:
}
out:
spin_unlock(&journal->j_list_lock);
- if (result < 0)
- jbd2_journal_abort(journal, result);
- else
- result = jbd2_cleanup_journal_tail(journal);
+ result = jbd2_cleanup_journal_tail(journal);
return (result < 0) ? result : 0;
}
@@ -437,7 +421,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
- int ret;
if (!jh)
return 0;
@@ -446,13 +429,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
do {
jh = next_jh;
next_jh = jh->b_cpnext;
- if (!destroy)
- ret = __try_to_free_cp_buf(jh);
- else
- ret = __jbd2_journal_remove_checkpoint(jh) + 1;
- if (!ret)
+
+ if (!destroy && __cp_buffer_busy(jh))
return 0;
- if (ret == 2)
+
+ if (__jbd2_journal_remove_checkpoint(jh))
return 1;
/*
* This function only frees up some memory
@@ -468,6 +449,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
}
/*
+ * journal_shrink_one_cp_list
+ *
+ * Find 'nr_to_scan' written-back checkpoint buffers in the given list
+ * and try to release them. If the whole transaction is released, set
+ * the 'released' parameter. Return the number of released checkpointed
+ * buffers.
+ *
+ * Called with j_list_lock held.
+ */
+static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
+ unsigned long *nr_to_scan,
+ bool *released)
+{
+ struct journal_head *last_jh;
+ struct journal_head *next_jh = jh;
+ unsigned long nr_freed = 0;
+ int ret;
+
+ if (!jh || *nr_to_scan == 0)
+ return 0;
+
+ last_jh = jh->b_cpprev;
+ do {
+ jh = next_jh;
+ next_jh = jh->b_cpnext;
+
+ (*nr_to_scan)--;
+ if (__cp_buffer_busy(jh))
+ continue;
+
+ nr_freed++;
+ ret = __jbd2_journal_remove_checkpoint(jh);
+ if (ret) {
+ *released = true;
+ break;
+ }
+
+ if (need_resched())
+ break;
+ } while (jh != last_jh && *nr_to_scan);
+
+ return nr_freed;
+}
+
+/*
+ * jbd2_journal_shrink_checkpoint_list
+ *
+ * Find 'nr_to_scan' written-back checkpoint buffers in the journal
+ * and try to release them. Return the number of released checkpointed
+ * buffers.
+ *
+ * Takes j_list_lock internally; must be called without it held.
+ */
+unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
+ unsigned long *nr_to_scan)
+{
+ transaction_t *transaction, *last_transaction, *next_transaction;
+ bool released;
+ tid_t first_tid = 0, last_tid = 0, next_tid = 0;
+ tid_t tid = 0;
+ unsigned long nr_freed = 0;
+ unsigned long nr_scanned = *nr_to_scan;
+
+again:
+ spin_lock(&journal->j_list_lock);
+ if (!journal->j_checkpoint_transactions) {
+ spin_unlock(&journal->j_list_lock);
+ goto out;
+ }
+
+ /*
+ * Get next shrink transaction, resume previous scan or start
+ * over again. If some others do checkpoint and drop transaction
+ * from the checkpoint list, we ignore saved j_shrink_transaction
+ * and start over unconditionally.
+ */
+ if (journal->j_shrink_transaction)
+ transaction = journal->j_shrink_transaction;
+ else
+ transaction = journal->j_checkpoint_transactions;
+
+ if (!first_tid)
+ first_tid = transaction->t_tid;
+ last_transaction = journal->j_checkpoint_transactions->t_cpprev;
+ next_transaction = transaction;
+ last_tid = last_transaction->t_tid;
+ do {
+ transaction = next_transaction;
+ next_transaction = transaction->t_cpnext;
+ tid = transaction->t_tid;
+ released = false;
+
+ nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list,
+ nr_to_scan, &released);
+ if (*nr_to_scan == 0)
+ break;
+ if (need_resched() || spin_needbreak(&journal->j_list_lock))
+ break;
+ if (released)
+ continue;
+
+ nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list,
+ nr_to_scan, &released);
+ if (*nr_to_scan == 0)
+ break;
+ if (need_resched() || spin_needbreak(&journal->j_list_lock))
+ break;
+ } while (transaction != last_transaction);
+
+ if (transaction != last_transaction) {
+ journal->j_shrink_transaction = next_transaction;
+ next_tid = next_transaction->t_tid;
+ } else {
+ journal->j_shrink_transaction = NULL;
+ next_tid = 0;
+ }
+
+ spin_unlock(&journal->j_list_lock);
+ cond_resched();
+
+ if (*nr_to_scan && next_tid)
+ goto again;
+out:
+ nr_scanned -= *nr_to_scan;
+ trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
+ nr_freed, nr_scanned, next_tid);
+
+ return nr_freed;
+}
+
+/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
@@ -564,24 +676,37 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
- int ret = 0;
+ struct buffer_head *bh = jh2bh(jh);
JBUFFER_TRACE(jh, "entry");
- if ((transaction = jh->b_cp_transaction) == NULL) {
+ transaction = jh->b_cp_transaction;
+ if (!transaction) {
JBUFFER_TRACE(jh, "not on transaction");
- goto out;
+ return 0;
}
journal = transaction->t_journal;
JBUFFER_TRACE(jh, "removing from transaction");
+
+ /*
+ * If we have failed to write the buffer out to disk, the filesystem
+ * may become inconsistent. We cannot abort the journal here since
+ * we hold j_list_lock and we have to be careful about races with
+ * jbd2_journal_destroy(). So mark the writeback IO error in the
+ * journal here and we abort the journal later from a better context.
+ */
+ if (buffer_write_io_error(bh))
+ set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags);
+
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
+ percpu_counter_dec(&journal->j_jh_shrink_count);
jbd2_journal_put_journal_head(jh);
- if (transaction->t_checkpoint_list != NULL ||
- transaction->t_checkpoint_io_list != NULL)
- goto out;
+ /* Is this transaction empty? */
+ if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list)
+ return 0;
/*
* There is one special case to worry about: if we have just pulled the
@@ -593,10 +718,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
* See the comment at the end of jbd2_journal_commit_transaction().
*/
if (transaction->t_state != T_FINISHED)
- goto out;
+ return 0;
- /* OK, that was the last buffer for the transaction: we can now
- safely remove this transaction from the log */
+ /*
+ * OK, that was the last buffer for the transaction, we can now
+ * safely remove this transaction from the log.
+ */
stats = &transaction->t_chp_stats;
if (stats->cs_chp_time)
stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
@@ -606,9 +733,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
- ret = 1;
-out:
- return ret;
+ return 1;
}
/*
@@ -639,6 +764,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
+ percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count);
}
/*
@@ -654,6 +780,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
+
+ journal->j_shrink_transaction = NULL;
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2dc944442802..152880c298ca 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -934,10 +934,6 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
}
EXPORT_SYMBOL(jbd2_fc_wait_bufs);
-/*
- * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf
- * for completion.
- */
int jbd2_fc_release_bufs(journal_t *journal)
{
struct buffer_head *bh;
@@ -945,10 +941,6 @@ int jbd2_fc_release_bufs(journal_t *journal)
j_fc_off = journal->j_fc_off;
- /*
- * Wait in reverse order to minimize chances of us being woken up before
- * all IOs have completed
- */
for (i = j_fc_off - 1; i >= 0; i--) {
bh = journal->j_fc_wbuf[i];
if (!bh)
@@ -1618,6 +1610,10 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
if (is_journal_aborted(journal))
return -EIO;
+ if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) {
+ jbd2_journal_abort(journal, -EIO);
+ return -EIO;
+ }
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
@@ -1686,6 +1682,110 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
write_unlock(&journal->j_state_lock);
}
+/**
+ * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
+ * @journal: The journal to erase.
+ * @flags: A discard/zeroout request is sent for each physically contiguous
+ *	region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or
+ *	JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation
+ *	to perform.
+ *
+ * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes
+ * will be explicitly written if no hardware offload is available, see
+ * blkdev_issue_zeroout for more details.
+ *
+ * Return: 0 on success, -EINVAL for an invalid @flags combination, -ENXIO if
+ * the journal device has no request queue, -EOPNOTSUPP if a discard was
+ * requested but the queue does not support it, or the error returned by the
+ * block mapping, discard/zeroout or flush step that failed.
+ */
+static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
+{
+	int err = 0;
+	unsigned long block, log_offset; /* logical */
+	unsigned long long phys_block, block_start, block_stop; /* physical */
+	loff_t byte_start, byte_stop, byte_count;
+	struct request_queue *q = bdev_get_queue(journal->j_dev);
+
+	/* flags must be set to either discard or zeroout */
+	if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags ||
+			((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
+			(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
+		return -EINVAL;
+
+	if (!q)
+		return -ENXIO;
+
+	if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	/*
+	 * lookup block mapping and issue discard/zeroout for each
+	 * contiguous region
+	 */
+	log_offset = be32_to_cpu(journal->j_superblock->s_first);
+	/* ~0ULL marks "no region currently open" */
+	block_start = ~0ULL;
+	for (block = log_offset; block < journal->j_total_len; block++) {
+		err = jbd2_journal_bmap(journal, block, &phys_block);
+		if (err) {
+			pr_err("JBD2: bad block at offset %lu\n", block);
+			return err;
+		}
+
+		/*
+		 * open a new region; block_stop starts one below so that the
+		 * contiguity test underneath accepts this first block
+		 */
+		if (block_start == ~0ULL) {
+			block_start = phys_block;
+			block_stop = block_start - 1;
+		}
+
+		/*
+		 * last block not contiguous with current block,
+		 * process last contiguous region and return to this block on
+		 * next loop
+		 */
+		if (phys_block != block_stop + 1) {
+			block--;
+		} else {
+			block_stop++;
+			/*
+			 * if this isn't the last block of journal,
+			 * no need to process now because next block may also
+			 * be part of this contiguous region
+			 */
+			if (block != journal->j_total_len - 1)
+				continue;
+		}
+
+		/*
+		 * end of contiguous region or this is last block of journal,
+		 * take care of the region
+		 *
+		 * truncate_inode_pages_range() takes an inclusive end offset,
+		 * so byte_stop must address the last byte of the last block,
+		 * not its first byte; otherwise the cached copy of that block
+		 * is not fully dropped.
+		 */
+		byte_start = block_start * journal->j_blocksize;
+		byte_stop = block_stop * journal->j_blocksize +
+				journal->j_blocksize - 1;
+		byte_count = (block_stop - block_start + 1) *
+				journal->j_blocksize;
+
+		truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+				byte_start, byte_stop);
+
+		if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
+			err = blkdev_issue_discard(journal->j_dev,
+					byte_start >> SECTOR_SHIFT,
+					byte_count >> SECTOR_SHIFT,
+					GFP_NOFS, 0);
+		} else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) {
+			err = blkdev_issue_zeroout(journal->j_dev,
+					byte_start >> SECTOR_SHIFT,
+					byte_count >> SECTOR_SHIFT,
+					GFP_NOFS, 0);
+		}
+
+		if (unlikely(err != 0)) {
+			pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu\n",
+					err, block_start, block_stop);
+			return err;
+		}
+
+		/* reset start and stop after processing a region */
+		block_start = ~0ULL;
+	}
+
+	return blkdev_issue_flush(journal->j_dev);
+}
/**
* jbd2_journal_update_sb_errno() - Update error in the journal.
@@ -1951,6 +2051,93 @@ recovery_error:
}
/**
+ * jbd2_journal_shrink_scan()
+ *
+ * Scan the checkpointed buffer on the checkpoint list and release the
+ * journal_head.
+ */
+static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
+					      struct shrink_control *sc)
+{
+	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	unsigned long remaining = sc->nr_to_scan;
+	unsigned long freed;
+
+	/* Trace the request together with the counter value before scanning. */
+	trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan,
+		percpu_counter_read_positive(&journal->j_jh_shrink_count));
+
+	/* The callee is handed &remaining and may update it. */
+	freed = jbd2_journal_shrink_checkpoint_list(journal, &remaining);
+
+	/* Re-read the counter so the exit trace shows its value after the scan. */
+	trace_jbd2_shrink_scan_exit(journal, remaining, freed,
+		percpu_counter_read_positive(&journal->j_jh_shrink_count));
+
+	return freed;
+}
+
+/**
+ * jbd2_journal_shrink_count()
+ *
+ * Count the number of checkpoint buffers on the checkpoint list.
+ */
+static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
+					       struct shrink_control *sc)
+{
+	journal_t *journal = container_of(shrink, journal_t, j_shrinker);
+	/* Read of j_jh_shrink_count, reported as-is to the caller. */
+	unsigned long nr_cp_bufs =
+		percpu_counter_read_positive(&journal->j_jh_shrink_count);
+
+	trace_jbd2_shrink_count(journal, sc->nr_to_scan, nr_cp_bufs);
+	return nr_cp_bufs;
+}
+
+/**
+ * jbd2_journal_register_shrinker()
+ * @journal: Journal to act on.
+ *
+ * Init a percpu counter to record the checkpointed buffers on the checkpoint
+ * list and register a shrinker to release their journal_head.
+ */
+int jbd2_journal_register_shrinker(journal_t *journal)
+{
+	int ret;
+
+	/* No saved scan position to resume from yet. */
+	journal->j_shrink_transaction = NULL;
+
+	ret = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	/* Fill in the shrinker callbacks and tunables before registering. */
+	journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
+	journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
+	journal->j_shrinker.seeks = DEFAULT_SEEKS;
+	journal->j_shrinker.batch = journal->j_max_transaction_buffers;
+
+	ret = register_shrinker(&journal->j_shrinker);
+	/* On failure, release the counter initialised above. */
+	if (ret)
+		percpu_counter_destroy(&journal->j_jh_shrink_count);
+
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_journal_register_shrinker);
+
+/**
+ * jbd2_journal_unregister_shrinker()
+ * @journal: Journal to act on.
+ *
+ * Unregister the checkpointed buffer shrinker and destroy the percpu counter.
+ */
+void jbd2_journal_unregister_shrinker(journal_t *journal)
+{
+	/*
+	 * Tear down in the reverse order of jbd2_journal_register_shrinker():
+	 * unregister the shrinker first so that its count/scan callbacks,
+	 * which read j_jh_shrink_count, can no longer be invoked, and only
+	 * then destroy the counter.
+	 */
+	unregister_shrinker(&journal->j_shrinker);
+	percpu_counter_destroy(&journal->j_jh_shrink_count);
+}
+EXPORT_SYMBOL(jbd2_journal_unregister_shrinker);
+
+/**
* jbd2_journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
@@ -1995,6 +2182,16 @@ int jbd2_journal_destroy(journal_t *journal)
J_ASSERT(journal->j_checkpoint_transactions == NULL);
spin_unlock(&journal->j_list_lock);
+ /*
+ * OK, all checkpoint transactions have been checked, now check the
+ * write out io error flag and abort the journal if some buffer failed
+ * to write back to the original location, otherwise the filesystem
+ * may become inconsistent.
+ */
+ if (!is_journal_aborted(journal) &&
+ test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags))
+ jbd2_journal_abort(journal, -EIO);
+
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
mutex_lock_io(&journal->j_checkpoint_mutex);
@@ -2012,6 +2209,8 @@ int jbd2_journal_destroy(journal_t *journal)
brelse(journal->j_sb_buffer);
}
+ jbd2_journal_unregister_shrinker(journal);
+
if (journal->j_proc_entry)
jbd2_stats_proc_exit(journal);
iput(journal->j_inode);
@@ -2246,13 +2445,18 @@ EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
* jbd2_journal_flush() - Flush journal
* @journal: Journal to act on.
+ * @flags: optional operation on the journal blocks after the flush (see below)
*
* Flush all data for a given journal to disk and empty the journal.
* Filesystems can use this when remounting readonly to ensure that
- * recovery does not need to happen on remount.
+ * recovery does not need to happen on remount. Optionally, a discard or zeroout
+ * can be issued on the journal blocks after flushing.
+ *
+ * flags:
+ * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks
+ * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks
*/
-
-int jbd2_journal_flush(journal_t *journal)
+int jbd2_journal_flush(journal_t *journal, unsigned int flags)
{
int err = 0;
transaction_t *transaction = NULL;
@@ -2306,6 +2510,10 @@ int jbd2_journal_flush(journal_t *journal)
* commits of data to the journal will restore the current
* s_start value. */
jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+
+ if (flags)
+ err = __jbd2_journal_erase(journal, flags);
+
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e8fc45fd751f..8804e126805f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2123,7 +2123,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
{
struct buffer_head *head;
struct buffer_head *bh;
- bool has_write_io_error = false;
int ret = 0;
J_ASSERT(PageLocked(page));
@@ -2148,26 +2147,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page)
jbd2_journal_put_journal_head(jh);
if (buffer_jbd(bh))
goto busy;
-
- /*
- * If we free a metadata buffer which has been failed to
- * write out, the jbd2 checkpoint procedure will not detect
- * this failure and may lead to filesystem inconsistency
- * after cleanup journal tail.
- */
- if (buffer_write_io_error(bh)) {
- pr_err("JBD2: Error while async write back metadata bh %llu.",
- (unsigned long long)bh->b_blocknr);
- has_write_io_error = true;
- }
} while ((bh = bh->b_this_page) != head);
ret = try_to_free_buffers(page);
-
busy:
- if (has_write_io_error)
- jbd2_journal_abort(journal, -EIO);
-
return ret;
}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b0eb9c85eea0..57ab424c05ff 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -357,6 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations jfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = jfs_readpage,
.readahead = jfs_readahead,
.writepage = jfs_writepage,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index d73950fc3d57..26f2aa3586f9 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -17,12 +17,6 @@
#include "kernfs-internal.h"
-static const struct address_space_operations kernfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
-};
-
static const struct inode_operations kernfs_iops = {
.permission = kernfs_iop_permission,
.setattr = kernfs_iop_setattr,
@@ -203,7 +197,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
{
kernfs_get(kn);
inode->i_private = kn;
- inode->i_mapping->a_ops = &kernfs_aops;
+ inode->i_mapping->a_ops = &ram_aops;
inode->i_op = &kernfs_iops;
inode->i_generation = kernfs_gen(kn);
diff --git a/fs/libfs.c b/fs/libfs.c
index e9b29c6ffccb..51b4de3b3447 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -512,7 +512,7 @@ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
EXPORT_SYMBOL(simple_setattr);
-int simple_readpage(struct file *file, struct page *page)
+static int simple_readpage(struct file *file, struct page *page)
{
clear_highpage(page);
flush_dcache_page(page);
@@ -520,7 +520,6 @@ int simple_readpage(struct file *file, struct page *page)
unlock_page(page);
return 0;
}
-EXPORT_SYMBOL(simple_readpage);
int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -568,7 +567,7 @@ EXPORT_SYMBOL(simple_write_begin);
*
* Use *ONLY* with simple_readpage()
*/
-int simple_write_end(struct file *file, struct address_space *mapping,
+static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
@@ -597,7 +596,17 @@ int simple_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-EXPORT_SYMBOL(simple_write_end);
+
+/*
+ * Provides ramfs-style behavior: data in the pagecache, but no writeback.
+ */
+const struct address_space_operations ram_aops = {
+ .readpage = simple_readpage,
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end,
+ .set_page_dirty = __set_page_dirty_no_writeback,
+};
+EXPORT_SYMBOL(ram_aops);
/*
* the inodes created here are not hashed. If you use iunique to generate
@@ -1162,22 +1171,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-int noop_set_page_dirty(struct page *page)
-{
- /*
- * Unlike __set_page_dirty_no_writeback that handles dirty page
- * tracking in the page object, dax does all dirty tracking in
- * the inode address_space in response to mkwrite faults. In the
- * dax case we only need to worry about potentially dirty CPU
- * caches, not dirty page cache pages to write back.
- *
- * This callback is defined to prevent fallback to
- * __set_page_dirty_buffers() in set_page_dirty().
- */
- return 0;
-}
-EXPORT_SYMBOL_GPL(noop_set_page_dirty);
-
void noop_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
@@ -1208,19 +1201,10 @@ void kfree_link(void *p)
}
EXPORT_SYMBOL(kfree_link);
-/*
- * nop .set_page_dirty method so that people can use .page_mkwrite on
- * anon inodes.
- */
-static int anon_set_page_dirty(struct page *page)
-{
- return 0;
-};
-
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
- .set_page_dirty = anon_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
};
struct inode *inode = new_inode_pseudo(s);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a532a99bbe81..a71f1cf894b9 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -442,6 +442,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations minix_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = minix_readpage,
.writepage = minix_writepage,
.write_begin = minix_write_begin,
diff --git a/fs/namespace.c b/fs/namespace.c
index c3f1a78ba369..ab4174a3c802 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3464,9 +3464,10 @@ out_type:
return ret;
}
-#define FSMOUNT_VALID_FLAGS \
- (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
- MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
+#define FSMOUNT_VALID_FLAGS \
+ (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
+ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \
+ MOUNT_ATTR_NOSYMFOLLOW)
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
@@ -3487,6 +3488,8 @@ static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
mnt_flags |= MNT_NOEXEC;
if (attr_flags & MOUNT_ATTR_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;
+ if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
+ mnt_flags |= MNT_NOSYMFOLLOW;
return mnt_flags;
}
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index 578112713703..b4db21022cb4 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NETFS_SUPPORT
- tristate "Support for network filesystem high-level I/O"
+ tristate
help
This option enables support for network filesystems, including
helpers for high-level buffered I/O, abstracting out read
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 193841d03de0..0b6cd3b8734c 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -1011,12 +1011,42 @@ out:
}
EXPORT_SYMBOL(netfs_readpage);
-static void netfs_clear_thp(struct page *page)
+/**
+ * netfs_skip_page_read - prep a page for writing without reading first
+ * @page: page being prepared
+ * @pos: starting position for the write
+ * @len: length of write
+ *
+ * In some cases, write_begin doesn't need to read at all:
+ * - full page write
+ * - write that lies in a page that is completely beyond EOF
+ * - write that covers the page from start to EOF or beyond it
+ *
+ * If any of these criteria are met, then zero out the unwritten parts
+ * of the page and return true. Otherwise, return false.
+ */
+static bool netfs_skip_page_read(struct page *page, loff_t pos, size_t len)
{
- unsigned int i;
+ struct inode *inode = page->mapping->host;
+ loff_t i_size = i_size_read(inode);
+ size_t offset = offset_in_thp(page, pos);
+
+ /* Full page write */
+ if (offset == 0 && len >= thp_size(page))
+ return true;
+
+ /* pos beyond last page in the file */
+ if (pos - offset >= i_size)
+ goto zero_out;
+
+ /* Write that covers from the start of the page to EOF or beyond */
+ if (offset == 0 && (pos + len) >= i_size)
+ goto zero_out;
- for (i = 0; i < thp_nr_pages(page); i++)
- clear_highpage(page + i);
+ return false;
+zero_out:
+ zero_user_segments(page, 0, offset, offset + len, thp_size(page));
+ return true;
}
/**
@@ -1024,7 +1054,7 @@ static void netfs_clear_thp(struct page *page)
* @file: The file to read from
* @mapping: The mapping to read from
* @pos: File position at which the write will begin
- * @len: The length of the write in this page
+ * @len: The length of the write (may extend beyond the end of the page chosen)
* @flags: AOP_* flags
* @_page: Where to put the resultant page
* @_fsdata: Place for the netfs to store a cookie
@@ -1061,14 +1091,12 @@ int netfs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = file_inode(file);
unsigned int debug_index = 0;
pgoff_t index = pos >> PAGE_SHIFT;
- int pos_in_page = pos & ~PAGE_MASK;
- loff_t size;
int ret;
DEFINE_READAHEAD(ractl, file, NULL, mapping, index);
retry:
- page = grab_cache_page_write_begin(mapping, index, 0);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -1090,13 +1118,8 @@ retry:
* within the cache granule containing the EOF, in which case we need
* to preload the granule.
*/
- size = i_size_read(inode);
if (!ops->is_cache_enabled(inode) &&
- ((pos_in_page == 0 && len == thp_size(page)) ||
- (pos >= size) ||
- (pos_in_page == 0 && (pos + len) >= size))) {
- netfs_clear_thp(page);
- SetPageUptodate(page);
+ netfs_skip_page_read(page, pos, len)) {
netfs_stat(&netfs_n_rh_write_zskip);
goto have_page_no_wait;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index cfeaadf56bf0..330f65727c45 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -406,7 +406,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
if (cl_init->hostname == NULL) {
WARN_ON(1);
- return NULL;
+ return ERR_PTR(-EINVAL);
}
/* see if the client already exists */
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d158a500c25c..d2103852475f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -718,7 +718,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
if (unlikely(!p))
goto out_err;
fl->fh_array[i]->size = be32_to_cpup(p++);
- if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
+ if (fl->fh_array[i]->size > NFS_MAXFHSIZE) {
printk(KERN_ERR "NFS: Too big fh %d received %d\n",
i, fl->fh_array[i]->size);
goto out_err;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 93e60e921f92..bc0c698f3350 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -362,7 +362,7 @@ static const struct kernel_param_ops param_ops_nfs_timeout = {
.set = param_set_nfs_timeout,
.get = param_get_nfs_timeout,
};
-#define param_check_nfs_timeout(name, p) __param_check(name, p, int);
+#define param_check_nfs_timeout(name, p) __param_check(name, p, int)
module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644);
MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 065cb04222a1..543d916f79ab 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -205,6 +205,7 @@ struct nfs4_exception {
struct inode *inode;
nfs4_stateid *stateid;
long timeout;
+ unsigned char task_is_privileged : 1;
unsigned char delay : 1,
recovering : 1,
retry : 1;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 889a9f4c0310..42719384e25f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -435,8 +435,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
*/
nfs_mark_client_ready(clp, -EPERM);
}
- nfs_put_client(clp);
clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags);
+ nfs_put_client(clp);
return old;
error:
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 57b3821d975a..a1e5c6b85ded 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -211,7 +211,7 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
case SEEK_HOLE:
case SEEK_DATA:
ret = nfs42_proc_llseek(filep, offset, whence);
- if (ret != -ENOTSUPP)
+ if (ret != -EOPNOTSUPP)
return ret;
fallthrough;
default:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 87d04f2c9385..e653654c10bc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -589,6 +589,8 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
goto out_retry;
}
if (exception->recovering) {
+ if (exception->task_is_privileged)
+ return -EDEADLOCK;
ret = nfs4_wait_clnt_recover(clp);
if (test_bit(NFS_MIG_FAILED, &server->mig_status))
return -EIO;
@@ -614,6 +616,8 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
goto out_retry;
}
if (exception->recovering) {
+ if (exception->task_is_privileged)
+ return -EDEADLOCK;
rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
@@ -1706,7 +1710,7 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
rcu_read_unlock();
trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
- if (!signal_pending(current)) {
+ if (!fatal_signal_pending(current)) {
if (schedule_timeout(5*HZ) == 0)
status = -EAGAIN;
else
@@ -3487,7 +3491,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
write_sequnlock(&state->seqlock);
trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
- if (signal_pending(current))
+ if (fatal_signal_pending(current))
status = -EINTR;
else
if (schedule_timeout(5*HZ) != 0)
@@ -3878,6 +3882,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->caps |= NFS_CAP_HARDLINKS;
if (res.has_symlinks != 0)
server->caps |= NFS_CAP_SYMLINKS;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+ server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID))
server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID;
if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE))
@@ -3898,10 +3906,6 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME;
if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
-#ifdef CONFIG_NFS_V4_SECURITY_LABEL
- if (!(res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL))
- server->fattr_valid &= ~NFS_ATTR_FATTR_V4_SECURITY_LABEL;
-#endif
memcpy(server->attr_bitmask_nl, res.attr_bitmask,
sizeof(server->attr_bitmask));
server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
@@ -5968,6 +5972,14 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
do {
err = __nfs4_proc_set_acl(inode, buf, buflen);
trace_nfs4_set_acl(inode, err);
+ if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) {
+ /*
+ * no need to retry since the kernel
+ * isn't involved in encoding the ACEs.
+ */
+ err = -EINVAL;
+ break;
+ }
err = nfs4_handle_exception(NFS_SERVER(inode), err,
&exception);
} while (exception.retry);
@@ -6409,6 +6421,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
struct nfs4_exception exception = {
.inode = data->inode,
.stateid = &data->stateid,
+ .task_is_privileged = data->args.seq_args.sa_privileged,
};
if (!nfs4_sequence_done(task, &data->res.seq_res))
@@ -6532,7 +6545,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
data = kzalloc(sizeof(*data), GFP_NOFS);
if (data == NULL)
return -ENOMEM;
- nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
nfs4_state_protect(server->nfs_client,
NFS_SP4_MACH_CRED_CLEANUP,
@@ -6563,6 +6575,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
}
}
+ if (!data->inode)
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 1);
+ else
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1,
+ 0);
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
msg.rpc_resp = &data->res;
@@ -9640,15 +9658,20 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
&task_setup_data.rpc_client, &msg);
dprintk("--> %s\n", __func__);
+ lrp->inode = nfs_igrab_and_active(lrp->args.inode);
if (!sync) {
- lrp->inode = nfs_igrab_and_active(lrp->args.inode);
if (!lrp->inode) {
nfs4_layoutreturn_release(lrp);
return -EAGAIN;
}
task_setup_data.flags |= RPC_TASK_ASYNC;
}
- nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, 0);
+ if (!lrp->inode)
+ nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
+ 1);
+ else
+ nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1,
+ 0);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index eb1ef3462e84..ccef43e02b48 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -430,10 +430,6 @@ TRACE_DEFINE_ENUM(O_CLOEXEC);
{ O_NOATIME, "O_NOATIME" }, \
{ O_CLOEXEC, "O_CLOEXEC" })
-TRACE_DEFINE_ENUM(FMODE_READ);
-TRACE_DEFINE_ENUM(FMODE_WRITE);
-TRACE_DEFINE_ENUM(FMODE_EXEC);
-
#define show_fmode_flags(mode) \
__print_flags(mode, "|", \
{ ((__force unsigned long)FMODE_READ), "READ" }, \
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6c20b28d9d7c..cf9cc62ec48e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1094,15 +1094,16 @@ nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *prev = NULL;
unsigned int size;
- if (mirror->pg_count != 0) {
- prev = nfs_list_entry(mirror->pg_list.prev);
- } else {
+ if (list_empty(&mirror->pg_list)) {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
if (desc->pg_error < 0)
return 0;
mirror->pg_base = req->wb_pgbase;
- }
+ mirror->pg_count = 0;
+ mirror->pg_recoalesce = 0;
+ } else
+ prev = nfs_list_entry(mirror->pg_list.prev);
if (desc->pg_maxretrans && req->wb_nio > desc->pg_maxretrans) {
if (NFS_SERVER(desc->pg_inode)->flags & NFS_MOUNT_SOFTERR)
@@ -1127,18 +1128,13 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
if (!list_empty(&mirror->pg_list)) {
int error = desc->pg_ops->pg_doio(desc);
if (error < 0)
desc->pg_error = error;
- else
+ if (list_empty(&mirror->pg_list))
mirror->pg_bytes_written += mirror->pg_count;
}
- if (list_empty(&mirror->pg_list)) {
- mirror->pg_count = 0;
- mirror->pg_base = 0;
- }
}
static void
@@ -1227,10 +1223,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
do {
list_splice_init(&mirror->pg_list, &head);
- mirror->pg_bytes_written -= mirror->pg_count;
- mirror->pg_count = 0;
- mirror->pg_base = 0;
- mirror->pg_recoalesce = 0;
while (!list_empty(&head)) {
struct nfs_page *req;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 03e0b34c4a64..2c01ee805306 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1317,6 +1317,11 @@ _pnfs_return_layout(struct inode *ino)
{
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
LIST_HEAD(tmp_list);
const struct cred *cred;
nfs4_stateid stateid;
@@ -1344,16 +1349,10 @@ _pnfs_return_layout(struct inode *ino)
}
valid_layout = pnfs_layout_is_valid(lo);
pnfs_clear_layoutcommit(ino, &tmp_list);
- pnfs_mark_matching_lsegs_return(lo, &tmp_list, NULL, 0);
+ pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);
- if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
- struct pnfs_layout_range range = {
- .iomode = IOMODE_ANY,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
+ if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
- }
/* Don't send a LAYOUTRETURN if list was initially empty */
if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
@@ -2678,7 +2677,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- u64 rd_size = req->wb_bytes;
+ u64 rd_size;
pnfs_generic_pg_check_layout(pgio);
pnfs_generic_pg_check_range(pgio, req);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 19a212f9725d..fe58525cfed4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1379,7 +1379,7 @@ static const struct kernel_param_ops param_ops_portnr = {
.set = param_set_portnr,
.get = param_get_uint,
};
-#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int)
module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b517a8794400..cd5eac2ba054 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2351,7 +2351,7 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
static void seq_quote_mem(struct seq_file *m, char *data, int len)
{
seq_printf(m, "\"");
- seq_escape_mem_ascii(m, data, len);
+ seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\");
seq_printf(m, "\"");
}
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f42ab57201e7..ab9ec073330f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -738,7 +738,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
if (ptr2 != ptr + cnt || ++cnt == maxblocks)
goto end;
index++;
- continue;
}
if (level == maxlevel)
break;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c0361ce45f62..97769fe4d588 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -434,6 +434,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
static const struct address_space_operations def_mdt_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.writepage = nilfs_mdt_write_page,
};
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 303d71430bdd..68e8d61e28dd 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -19,19 +19,6 @@
/* /sys/fs/<nilfs>/ */
static struct kset *nilfs_kset;
-#define NILFS_SHOW_TIME(time_t_val, buf) ({ \
- struct tm res; \
- int count = 0; \
- time64_to_tm(time_t_val, 0, &res); \
- res.tm_year += 1900; \
- res.tm_mon += 1; \
- count = scnprintf(buf, PAGE_SIZE, \
- "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \
- res.tm_year, res.tm_mon, res.tm_mday, \
- res.tm_hour, res.tm_min, res.tm_sec);\
- count; \
-})
-
#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \
static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \
struct attribute *attr, char *buf) \
@@ -576,7 +563,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
ctime = nilfs->ns_ctime;
up_read(&nilfs->ns_segctor_sem);
- return NILFS_SHOW_TIME(ctime, buf);
+ return sysfs_emit(buf, "%ptTs\n", &ctime);
}
static ssize_t
@@ -604,7 +591,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
nongc_ctime = nilfs->ns_nongc_ctime;
up_read(&nilfs->ns_segctor_sem);
- return NILFS_SHOW_TIME(nongc_ctime, buf);
+ return sysfs_emit(buf, "%ptTs\n", &nongc_ctime);
}
static ssize_t
@@ -724,7 +711,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr,
sbwtime = nilfs->ns_sbwtime;
up_read(&nilfs->ns_sem);
- return NILFS_SHOW_TIME(sbwtime, buf);
+ return sysfs_emit(buf, "%ptTs\n", &sbwtime);
}
static ssize_t
@@ -1053,6 +1040,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
nilfs_sysfs_delete_superblock_group(nilfs);
nilfs_sysfs_delete_segctor_group(nilfs);
kobject_del(&nilfs->ns_dev_kobj);
+ kobject_put(&nilfs->ns_dev_kobj);
kfree(nilfs->ns_dev_subgroups);
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 71fefb30e015..64864fb40b40 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -424,11 +424,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
* events generated by the listener process itself, without disclosing
* the pids of other processes.
*/
- if (!capable(CAP_SYS_ADMIN) &&
+ if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
task_tgid(current) != event->pid)
metadata.pid = 0;
- if (path && path->mnt && path->dentry) {
+ /*
+ * For now, fid mode is required for an unprivileged listener and
+ * fid mode does not report fd in events. Keep this check anyway
+ * for safety in case fid mode requirement is relaxed in the future
+ * to allow unprivileged listener to get events with no fd and no fid.
+ */
+ if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
+ path && path->mnt && path->dentry) {
fd = create_fd(group, path, &f);
if (fd < 0)
return fd;
@@ -464,7 +471,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
info_type, fanotify_info_name(info),
info->name_len, buf, count);
if (ret < 0)
- return ret;
+ goto out_close_fd;
buf += ret;
count -= ret;
@@ -512,7 +519,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
fanotify_event_object_fh(event),
info_type, dot, dot_len, buf, count);
if (ret < 0)
- return ret;
+ goto out_close_fd;
buf += ret;
count -= ret;
@@ -1040,6 +1047,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
int f_flags, fd;
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
unsigned int class = flags & FANOTIFY_CLASS_BITS;
+ unsigned int internal_flags = 0;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@@ -1053,6 +1061,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
*/
if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
return -EPERM;
+
+ /*
+ * Setting the internal flag FANOTIFY_UNPRIV on the group
+ * prevents setting mount/filesystem marks on this group and
+ * prevents reporting pid and open fd in events.
+ */
+ internal_flags |= FANOTIFY_UNPRIV;
}
#ifdef CONFIG_AUDITSYSCALL
@@ -1105,7 +1120,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
goto out_destroy_group;
}
- group->fanotify_data.flags = flags;
+ group->fanotify_data.flags = flags | internal_flags;
group->memcg = get_mem_cgroup_from_mm(current->mm);
group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
@@ -1305,11 +1320,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group = f.file->private_data;
/*
- * An unprivileged user is not allowed to watch a mount point nor
- * a filesystem.
+ * An unprivileged user is not allowed to setup mount nor filesystem
+ * marks. This also includes setting up such marks by a group that
+ * was initialized by an unprivileged user.
*/
ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN) &&
+ if ((!capable(CAP_SYS_ADMIN) ||
+ FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
mark_type != FAN_MARK_INODE)
goto fput_and_out;
@@ -1460,6 +1477,7 @@ static int __init fanotify_user_setup(void)
max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
FANOTIFY_DEFAULT_MAX_USER_MARKS);
+ BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index a712b2aaa9ac..57f0d5d9f934 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -144,7 +144,7 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
struct fsnotify_group *group = f->private_data;
seq_printf(m, "fanotify flags:%x event-flags:%x\n",
- group->fanotify_data.flags,
+ group->fanotify_data.flags & FANOTIFY_INIT_FLAGS,
group->fanotify_data.f_flags);
show_fdinfo(m, f, fanotify_fdinfo);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index f5c058b3192c..4474adb393ca 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -477,7 +477,7 @@ err_corrupt_attr:
}
file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
le16_to_cpu(attr->data.resident.value_offset));
- p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length);
+ p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
if (p2 < (u8*)attr || p2 > p)
goto err_corrupt_attr;
/* This attribute is ok, but is it in the $Extend directory? */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e032f2e2c2c5..f1cc8258d34a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6018,7 +6018,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
* Then truncate log will be replayed resulting in cluster double free.
*/
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1294925ac94a..68d11c295dd3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -632,8 +632,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
}
if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_new(bh) &&
ocfs2_should_read_blk(inode, page, block_start) &&
@@ -2454,6 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations ocfs2_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = ocfs2_readpage,
.readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index e829c2595543..f89ffcbd585f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1442,8 +1442,6 @@ void o2hb_init(void)
for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
INIT_LIST_HEAD(&o2hb_live_slots[i]);
- INIT_LIST_HEAD(&o2hb_node_events);
-
memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
@@ -1598,12 +1596,13 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
struct o2hb_region *reg = to_o2hb_region(item);
unsigned long long tmp;
char *p = (char *)page;
+ ssize_t ret;
if (reg->hr_bdev)
return -EINVAL;
- tmp = simple_strtoull(p, &p, 0);
- if (!p || (*p && (*p != '\n')))
+ ret = kstrtoull(p, 0, &tmp);
+ if (ret)
return -EINVAL;
reg->hr_start_block = tmp;
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb82e6b1ff4e..625c92521416 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -824,7 +824,7 @@ static void __exit exit_o2nm(void)
static int __init init_o2nm(void)
{
- int ret = -1;
+ int ret;
o2hb_init();
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4960a6de768d..9b88219febb5 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2977,7 +2977,7 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
enum dlm_lockres_list idx;
- struct list_head *queue = &res->granted;
+ struct list_head *queue;
struct dlm_lock *lock;
int noderef;
u8 nodenum = O2NM_MAX_NODES;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f17c3d33fb18..775657943057 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1856,6 +1856,45 @@ out:
}
/*
+ * zero out partial blocks of one cluster.
+ *
+ * start: file offset where zero starts, will be made upper block aligned.
+ * len: it will be trimmed to the end of current cluster if "start + len"
+ * is bigger than it.
+ */
+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u64 start_block, end_block, nr_blocks;
+ u64 p_block, offset;
+ u32 cluster, p_cluster, nr_clusters;
+ struct super_block *sb = inode->i_sb;
+ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
+
+ if (start + len < end)
+ end = start + len;
+
+ start_block = ocfs2_blocks_for_bytes(sb, start);
+ end_block = ocfs2_blocks_for_bytes(sb, end);
+ nr_blocks = end_block - start_block;
+ if (!nr_blocks)
+ return 0;
+
+ cluster = ocfs2_bytes_to_clusters(sb, start);
+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
+ &nr_clusters, NULL);
+ if (ret)
+ return ret;
+ if (!p_cluster)
+ return 0;
+
+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
+}
+
+/*
* Parts of this function taken from xfs_change_file_space()
*/
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
@@ -1865,7 +1904,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
{
int ret;
s64 llen;
- loff_t size;
+ loff_t size, orig_isize;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *di_bh = NULL;
handle_t *handle;
@@ -1896,6 +1935,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
+ orig_isize = i_size_read(inode);
switch (sr->l_whence) {
case 0: /*SEEK_SET*/
break;
@@ -1903,7 +1943,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
sr->l_start += f_pos;
break;
case 2: /*SEEK_END*/
- sr->l_start += i_size_read(inode);
+ sr->l_start += orig_isize;
break;
default:
ret = -EINVAL;
@@ -1957,6 +1997,14 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
default:
ret = -EINVAL;
}
+
+ /* zeroout eof blocks in the cluster. */
+ if (!ret && change_size && orig_isize < size) {
+ ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
+ size - orig_isize);
+ if (!ret)
+ i_size_write(inode, size);
+ }
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
@@ -1973,9 +2021,6 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- if (change_size && i_size_read(inode) < size)
- i_size_write(inode, size);
-
inode->i_ctime = inode->i_mtime = current_time(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 90b8d300c1ee..de56e6231af8 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -326,11 +326,7 @@ static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
p->fe_ino, p->fe_done,
ocfs2_filecheck_error(p->fe_status));
- if (ret < 0) {
- total = ret;
- break;
- }
- if (ret == remain) {
+ if (ret >= remain) {
/* snprintf() didn't fit */
total = -E2BIG;
break;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4e589ce2fce6..4f15750aac5d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -308,7 +308,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
}
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
up_write(&journal->j_trans_barrier);
@@ -1000,7 +1000,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (ocfs2_mount_local(osb)) {
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
@@ -1070,7 +1070,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
if (replayed) {
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, 0);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
@@ -1666,7 +1666,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* wipe the journal */
jbd2_journal_lock_updates(journal);
- status = jbd2_journal_flush(journal);
+ status = jbd2_journal_flush(journal, 0);
jbd2_journal_unlock_updates(journal);
if (status < 0)
mlog_errno(status);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index d50e8b8dfea4..16f1bfc407f2 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -500,11 +500,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
ret = snprintf(buf, remain, "%s\n",
p->sp_name);
- if (ret < 0) {
- total = ret;
- break;
- }
- if (ret == remain) {
+ if (ret >= remain) {
/* snprintf() didn't fit */
total = -E2BIG;
break;
@@ -531,7 +527,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
if (active_stack) {
ret = snprintf(buf, PAGE_SIZE, "%s\n",
active_stack->sp_name);
- if (ret == PAGE_SIZE)
+ if (ret >= PAGE_SIZE)
ret = -E2BIG;
}
spin_unlock(&ocfs2_stack_lock);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 11e733aab25d..89725b15a64b 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,6 +372,7 @@ const struct inode_operations omfs_file_inops = {
};
const struct address_space_operations omfs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = omfs_readpage,
.readahead = omfs_readahead,
.writepage = omfs_writepage,
diff --git a/fs/open.c b/fs/open.c
index e53af13b5835..1a325b3194df 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -852,8 +852,17 @@ static int do_dentry_open(struct file *f,
* XXX: Huge page cache doesn't support writing yet. Drop all page
* cache for this file before processing writes.
*/
- if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
- truncate_pagecache(inode, 0);
+ if (f->f_mode & FMODE_WRITE) {
+ /*
+ * Paired with smp_mb() in collapse_file() to ensure nr_thps
+ * is up to date and the update to i_writecount by
+ * get_write_access() is visible. Ensures subsequent insertion
+ * of THPs into the page cache will fail.
+ */
+ smp_mb();
+ if (filemap_nr_thps(inode->i_mapping))
+ truncate_pagecache(inode, 0);
+ }
return 0;
@@ -1002,12 +1011,20 @@ inline struct open_how build_open_how(int flags, umode_t mode)
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
- int flags = how->flags;
+ u64 flags = how->flags;
+ u64 strip = FMODE_NONOTIFY | O_CLOEXEC;
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
- /* Must never be set by userspace */
- flags &= ~(FMODE_NONOTIFY | O_CLOEXEC);
+ BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
+ "struct open_flags doesn't yet handle flags > 32 bits");
+
+ /*
+ * Strip flags that either shouldn't be set by userspace like
+ * FMODE_NONOTIFY or that aren't relevant in determining struct
+ * open_flags like O_CLOEXEC.
+ */
+ flags &= ~strip;
/*
* Older syscalls implicitly clear all of the invalid flags or argument
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7ec59171f197..ee0ce8cecc4a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
- qsize = atomic_read(&__task_cred(p)->user->sigpending);
+ qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3851bfcdba56..e5b5f7709d48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -854,7 +854,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
while (count > 0) {
- int this_len = min_t(int, count, PAGE_SIZE);
+ size_t this_len = min_t(size_t, count, PAGE_SIZE);
if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
@@ -2674,6 +2674,13 @@ out:
}
#ifdef CONFIG_SECURITY
+static int proc_pid_attr_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+ return 0;
+}
+
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
@@ -2703,6 +2710,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
void *page;
int rv;
+ /* A task may only write when it was the opener. */
+ if (file->private_data != current->mm)
+ return -EPERM;
+
rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (!task) {
@@ -2750,9 +2761,11 @@ out:
}
static const struct file_operations proc_pid_attr_operations = {
+ .open = proc_pid_attr_open,
.read = proc_pid_attr_read,
.write = proc_pid_attr_write,
.llseek = generic_file_llseek,
+ .release = mem_release,
};
#define LSM_DIR_OPS(LSM) \
@@ -3159,7 +3172,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
@@ -3504,7 +3517,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = {
*/
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
- DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 07fc4fad2602..172c86270b31 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -6,6 +6,7 @@
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/pid.h>
+#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
@@ -53,9 +54,10 @@ static int seq_show(struct seq_file *m, void *v)
if (ret)
return ret;
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n",
(long long)file->f_pos, f_flags,
- real_mount(file->f_path.mnt)->mnt_id);
+ real_mount(file->f_path.mnt)->mnt_id,
+ file_inode(file)->i_ino);
/* show_fd_locks() never deferences files so a stale value is safe */
show_fd_locks(m, file, files);
@@ -72,6 +74,18 @@ out:
static int seq_fdinfo_open(struct inode *inode, struct file *file)
{
+ bool allowed = false;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
+ put_task_struct(task);
+
+ if (!allowed)
+ return -EACCES;
+
return single_open(file, seq_show, inode);
}
@@ -308,7 +322,7 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
- inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR);
+ inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO);
if (!inode)
return ERR_PTR(-ENOENT);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 4d2e64e9016c..982e694aae77 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -313,6 +313,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
{
char *buf = file->private_data;
size_t phdrs_offset, notes_offset, data_offset;
+ size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
struct kcore_list *m;
size_t tsz;
@@ -322,6 +323,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
int ret = 0;
down_read(&kclist_lock);
+ /*
+ * Don't race against drivers that set PageOffline() and expect no
+ * further page access.
+ */
+ page_offline_freeze();
get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
phdrs_offset = sizeof(struct elfhdr);
@@ -380,11 +386,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset;
- if (m->type == KCORE_REMAP)
- phdr->p_vaddr = (size_t)m->vaddr;
- else
- phdr->p_vaddr = (size_t)m->addr;
- if (m->type == KCORE_RAM || m->type == KCORE_REMAP)
+ phdr->p_vaddr = (size_t)m->addr;
+ if (m->type == KCORE_RAM)
phdr->p_paddr = __pa(m->addr);
else if (m->type == KCORE_TEXT)
phdr->p_paddr = __pa_symbol(m->addr);
@@ -468,6 +471,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
m = NULL;
while (buflen) {
+ struct page *page;
+ unsigned long pfn;
+
/*
* If this is the first iteration or the address is not within
* the previous entry, search for a matching entry.
@@ -480,31 +486,57 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
}
+ if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) {
+ page_offline_thaw();
+ cond_resched();
+ page_offline_freeze();
+ }
+
if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz)) {
ret = -EFAULT;
goto out;
}
m = NULL; /* skip the list anchor */
- } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else if (m->type == KCORE_VMALLOC) {
+ goto skip;
+ }
+
+ switch (m->type) {
+ case KCORE_VMALLOC:
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
if (copy_to_user(buffer, buf, tsz)) {
ret = -EFAULT;
goto out;
}
- } else if (m->type == KCORE_USER) {
+ break;
+ case KCORE_USER:
/* User page is handled prior to normal kernel page: */
if (copy_to_user(buffer, (char *)start, tsz)) {
ret = -EFAULT;
goto out;
}
- } else {
+ break;
+ case KCORE_RAM:
+ pfn = __pa(start) >> PAGE_SHIFT;
+ page = pfn_to_online_page(pfn);
+
+ /*
+ * Don't read offline sections, logically offline pages
+ * (e.g., inflated in a balloon), hwpoisoned pages,
+ * and explicitly excluded physical ranges.
+ */
+ if (!page || PageOffline(page) ||
+ is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ break;
+ }
+ fallthrough;
+ case KCORE_VMEMMAP:
+ case KCORE_TEXT:
if (kern_addr_valid(start)) {
/*
* Using bounce buffer to bypass the
@@ -528,7 +560,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
goto out;
}
}
+ break;
+ default:
+ pr_warn_once("Unhandled KCORE type: %d\n", m->type);
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
+skip:
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
@@ -537,6 +577,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
out:
+ page_offline_thaw();
up_read(&kclist_lock);
if (ret)
return ret;
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 8468baee951d..f32878d9a39f 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -16,7 +16,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
get_avenrun(avnrun, FIXED_1/200, 0);
- seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n",
LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index f25e8531fd27..6561a06ef905 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -200,8 +200,8 @@ static int show_stat(struct seq_file *p, void *v)
"\nctxt %llu\n"
"btime %llu\n"
"processes %lu\n"
- "procs_running %lu\n"
- "procs_blocked %lu\n",
+ "procs_running %u\n"
+ "procs_blocked %u\n",
nr_context_switches(),
(unsigned long long)boottime.tv_sec,
total_forks,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fc9784544b24..eb97468dfe4c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -514,10 +514,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
+ } else if (is_pfn_swap_entry(swpent))
+ page = pfn_swap_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = xa_load(&vma->vm_file->f_mapping->i_pages,
@@ -549,7 +547,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
swp_entry_t entry = pmd_to_swp_entry(*pmd);
if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
}
if (IS_ERR_OR_NULL(page))
return;
@@ -694,10 +692,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (is_migration_entry(swpent))
- page = migration_entry_to_page(swpent);
- else if (is_device_private_entry(swpent))
- page = device_private_entry_to_page(swpent);
+ if (is_pfn_swap_entry(swpent))
+ page = pfn_swap_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -832,7 +828,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %d\n",
- transparent_hugepage_enabled(vma));
+ transparent_hugepage_active(vma));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1047,7 +1043,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
return false;
if (!is_cow_mapping(vma->vm_flags))
return false;
- if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
+ if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
return false;
page = vm_normal_page(vma, addr, pte);
if (!page)
@@ -1302,6 +1298,7 @@ struct pagemapread {
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_UFFD_WP BIT_ULL(57)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
@@ -1375,20 +1372,21 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
+ if (pte_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
+ if (pte_swp_uffd_wp(pte))
+ flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
if (pm->show_pfn)
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
- if (is_migration_entry(entry))
- page = migration_entry_to_page(entry);
-
- if (is_device_private_entry(entry))
- page = device_private_entry_to_page(entry);
+ if (is_pfn_swap_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
}
if (page && !PageAnon(page))
@@ -1426,6 +1424,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
flags |= PM_PRESENT;
if (pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
+ if (pmd_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
if (pm->show_pfn)
frame = pmd_pfn(pmd) +
((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1444,8 +1444,10 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
+ if (pmd_swp_uffd_wp(pmd))
+ flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
}
#endif
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 8adabde685f1..328da35da390 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -173,6 +173,7 @@ config PSTORE_BLK
tristate "Log panic/oops to a block device"
depends on PSTORE
depends on BLOCK
+ depends on BROKEN
select PSTORE_ZONE
default n
help
diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c
index 4bb8a344957a..04ce58c939a0 100644
--- a/fs/pstore/blk.c
+++ b/fs/pstore/blk.c
@@ -8,15 +8,16 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include "../../block/blk.h"
#include <linux/blkdev.h>
#include <linux/string.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/platform_device.h>
#include <linux/pstore_blk.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/init_syscalls.h>
#include <linux/mount.h>
-#include <linux/uio.h>
static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE;
module_param(kmsg_size, long, 0400);
@@ -57,27 +58,7 @@ MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require sto
/*
* blkdev - the block device to use for pstore storage
- *
- * Usually, this will be a partition of a block device.
- *
- * blkdev accepts the following variants:
- * 1) <hex_major><hex_minor> device number in hexadecimal representation,
- * with no leading 0x, for example b302.
- * 2) /dev/<disk_name> represents the device number of disk
- * 3) /dev/<disk_name><decimal> represents the device number
- * of partition - device number of disk plus the partition number
- * 4) /dev/<disk_name>p<decimal> - same as the above, that form is
- * used when disk name of partitioned disk ends on a digit.
- * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
- * unique id of a partition if the partition table provides it.
- * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS
- * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero-
- * filled hex representation of the 32-bit "NT disk signature", and PP
- * is a zero-filled hex representation of the 1-based partition number.
- * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to
- * a partition with a known unique id.
- * 7) <major>:<minor> major and minor number of the device separated by
- * a colon.
+ * See Documentation/admin-guide/pstore-blk.rst for details.
*/
static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV;
module_param_string(blkdev, blkdev, 80, 0400);
@@ -88,14 +69,8 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage");
* during the register/unregister functions.
*/
static DEFINE_MUTEX(pstore_blk_lock);
-static struct block_device *psblk_bdev;
-static struct pstore_zone_info *pstore_zone_info;
-
-struct bdev_info {
- dev_t devt;
- sector_t nr_sects;
- sector_t start_sect;
-};
+static struct file *psblk_file;
+static struct pstore_device_info *pstore_device_info;
#define check_size(name, alignsize) ({ \
long _##name_ = (name); \
@@ -108,57 +83,63 @@ struct bdev_info {
_##name_; \
})
+#define verify_size(name, alignsize, enabled) { \
+ long _##name_; \
+ if (enabled) \
+ _##name_ = check_size(name, alignsize); \
+ else \
+ _##name_ = 0; \
+ /* Synchronize module parameters with results. */ \
+ name = _##name_ / 1024; \
+ dev->zone.name = _##name_; \
+}
+
static int __register_pstore_device(struct pstore_device_info *dev)
{
int ret;
lockdep_assert_held(&pstore_blk_lock);
- if (!dev || !dev->total_size || !dev->read || !dev->write)
+ if (!dev) {
+ pr_err("NULL device info\n");
+ return -EINVAL;
+ }
+ if (!dev->zone.total_size) {
+ pr_err("zero sized device\n");
return -EINVAL;
+ }
+ if (!dev->zone.read) {
+ pr_err("no read handler for device\n");
+ return -EINVAL;
+ }
+ if (!dev->zone.write) {
+ pr_err("no write handler for device\n");
+ return -EINVAL;
+ }
/* someone already registered before */
- if (pstore_zone_info)
+ if (pstore_device_info)
return -EBUSY;
- pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL);
- if (!pstore_zone_info)
- return -ENOMEM;
-
/* zero means no limit on which backends to attempt to store. */
if (!dev->flags)
dev->flags = UINT_MAX;
-#define verify_size(name, alignsize, enabled) { \
- long _##name_; \
- if (enabled) \
- _##name_ = check_size(name, alignsize); \
- else \
- _##name_ = 0; \
- name = _##name_ / 1024; \
- pstore_zone_info->name = _##name_; \
- }
-
+ /* Copy in module parameters. */
verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG);
verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG);
verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE);
verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE);
-#undef verify_size
-
- pstore_zone_info->total_size = dev->total_size;
- pstore_zone_info->max_reason = max_reason;
- pstore_zone_info->read = dev->read;
- pstore_zone_info->write = dev->write;
- pstore_zone_info->erase = dev->erase;
- pstore_zone_info->panic_write = dev->panic_write;
- pstore_zone_info->name = KBUILD_MODNAME;
- pstore_zone_info->owner = THIS_MODULE;
-
- ret = register_pstore_zone(pstore_zone_info);
- if (ret) {
- kfree(pstore_zone_info);
- pstore_zone_info = NULL;
- }
+ dev->zone.max_reason = max_reason;
+
+ /* Initialize required zone ownership details. */
+ dev->zone.name = KBUILD_MODNAME;
+ dev->zone.owner = THIS_MODULE;
+
+ ret = register_pstore_zone(&dev->zone);
+ if (ret == 0)
+ pstore_device_info = dev;
+
return ret;
}
/**
@@ -185,10 +166,9 @@ EXPORT_SYMBOL_GPL(register_pstore_device);
static void __unregister_pstore_device(struct pstore_device_info *dev)
{
lockdep_assert_held(&pstore_blk_lock);
- if (pstore_zone_info && pstore_zone_info->read == dev->read) {
- unregister_pstore_zone(pstore_zone_info);
- kfree(pstore_zone_info);
- pstore_zone_info = NULL;
+ if (pstore_device_info && pstore_device_info == dev) {
+ unregister_pstore_zone(&dev->zone);
+ pstore_device_info = NULL;
}
}
@@ -205,204 +185,59 @@ void unregister_pstore_device(struct pstore_device_info *dev)
}
EXPORT_SYMBOL_GPL(unregister_pstore_device);
-/**
- * psblk_get_bdev() - open block device
- *
- * @holder: Exclusive holder identifier
- * @info: Information about bdev to fill in
- *
- * Return: pointer to block device on success and others on error.
- *
- * On success, the returned block_device has reference count of one.
- */
-static struct block_device *psblk_get_bdev(void *holder,
- struct bdev_info *info)
-{
- struct block_device *bdev = ERR_PTR(-ENODEV);
- fmode_t mode = FMODE_READ | FMODE_WRITE;
- sector_t nr_sects;
-
- lockdep_assert_held(&pstore_blk_lock);
-
- if (pstore_zone_info)
- return ERR_PTR(-EBUSY);
-
- if (!blkdev[0])
- return ERR_PTR(-ENODEV);
-
- if (holder)
- mode |= FMODE_EXCL;
- bdev = blkdev_get_by_path(blkdev, mode, holder);
- if (IS_ERR(bdev)) {
- dev_t devt;
-
- devt = name_to_dev_t(blkdev);
- if (devt == 0)
- return ERR_PTR(-ENODEV);
- bdev = blkdev_get_by_dev(devt, mode, holder);
- if (IS_ERR(bdev))
- return bdev;
- }
-
- nr_sects = bdev_nr_sectors(bdev);
- if (!nr_sects) {
- pr_err("not enough space for '%s'\n", blkdev);
- blkdev_put(bdev, mode);
- return ERR_PTR(-ENOSPC);
- }
-
- if (info) {
- info->devt = bdev->bd_dev;
- info->nr_sects = nr_sects;
- info->start_sect = get_start_sect(bdev);
- }
-
- return bdev;
-}
-
-static void psblk_put_bdev(struct block_device *bdev, void *holder)
-{
- fmode_t mode = FMODE_READ | FMODE_WRITE;
-
- lockdep_assert_held(&pstore_blk_lock);
-
- if (!bdev)
- return;
-
- if (holder)
- mode |= FMODE_EXCL;
- blkdev_put(bdev, mode);
-}
-
static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos)
{
- struct block_device *bdev = psblk_bdev;
- struct file file;
- struct kiocb kiocb;
- struct iov_iter iter;
- struct kvec iov = {.iov_base = buf, .iov_len = bytes};
-
- if (!bdev)
- return -ENODEV;
-
- memset(&file, 0, sizeof(struct file));
- file.f_mapping = bdev->bd_inode->i_mapping;
- file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME;
- file.f_inode = bdev->bd_inode;
- file_ra_state_init(&file.f_ra, file.f_mapping);
-
- init_sync_kiocb(&kiocb, &file);
- kiocb.ki_pos = pos;
- iov_iter_kvec(&iter, READ, &iov, 1, bytes);
-
- return generic_file_read_iter(&kiocb, &iter);
+ return kernel_read(psblk_file, buf, bytes, &pos);
}
static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes,
loff_t pos)
{
- struct block_device *bdev = psblk_bdev;
- struct iov_iter iter;
- struct kiocb kiocb;
- struct file file;
- ssize_t ret;
- struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes};
-
- if (!bdev)
- return -ENODEV;
-
/* Console/Ftrace backend may handle buffer until flush dirty zones */
if (in_interrupt() || irqs_disabled())
return -EBUSY;
-
- memset(&file, 0, sizeof(struct file));
- file.f_mapping = bdev->bd_inode->i_mapping;
- file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME;
- file.f_inode = bdev->bd_inode;
-
- init_sync_kiocb(&kiocb, &file);
- kiocb.ki_pos = pos;
- iov_iter_kvec(&iter, WRITE, &iov, 1, bytes);
-
- inode_lock(bdev->bd_inode);
- ret = generic_write_checks(&kiocb, &iter);
- if (ret > 0)
- ret = generic_perform_write(&file, &iter, pos);
- inode_unlock(bdev->bd_inode);
-
- if (likely(ret > 0)) {
- const struct file_operations f_op = {.fsync = blkdev_fsync};
-
- file.f_op = &f_op;
- kiocb.ki_pos += ret;
- ret = generic_write_sync(&kiocb, ret);
- }
- return ret;
+ return kernel_write(psblk_file, buf, bytes, &pos);
}
/*
* This takes its configuration only from the module parameters now.
- * See psblk_get_bdev() and blkdev.
*/
-static int __register_pstore_blk(void)
+static int __register_pstore_blk(struct pstore_device_info *dev,
+ const char *devpath)
{
- char bdev_name[BDEVNAME_SIZE];
- struct block_device *bdev;
- struct pstore_device_info dev;
- struct bdev_info binfo;
- void *holder = blkdev;
+ struct inode *inode;
int ret = -ENODEV;
lockdep_assert_held(&pstore_blk_lock);
- /* hold bdev exclusively */
- memset(&binfo, 0, sizeof(binfo));
- bdev = psblk_get_bdev(holder, &binfo);
- if (IS_ERR(bdev)) {
- pr_err("failed to open '%s'!\n", blkdev);
- return PTR_ERR(bdev);
+ psblk_file = filp_open(devpath, O_RDWR | O_DSYNC | O_NOATIME | O_EXCL, 0);
+ if (IS_ERR(psblk_file)) {
+ ret = PTR_ERR(psblk_file);
+ pr_err("failed to open '%s': %d!\n", devpath, ret);
+ goto err;
}
- /* only allow driver matching the @blkdev */
- if (!binfo.devt) {
- pr_debug("no major\n");
- ret = -ENODEV;
- goto err_put_bdev;
+ inode = file_inode(psblk_file);
+ if (!S_ISBLK(inode->i_mode)) {
+ pr_err("'%s' is not block device!\n", devpath);
+ goto err_fput;
}
- /* psblk_bdev must be assigned before register to pstore/blk */
- psblk_bdev = bdev;
-
- memset(&dev, 0, sizeof(dev));
- dev.total_size = binfo.nr_sects << SECTOR_SHIFT;
- dev.read = psblk_generic_blk_read;
- dev.write = psblk_generic_blk_write;
+ inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode;
+ dev->zone.total_size = i_size_read(inode);
- ret = __register_pstore_device(&dev);
+ ret = __register_pstore_device(dev);
if (ret)
- goto err_put_bdev;
+ goto err_fput;
- bdevname(bdev, bdev_name);
- pr_info("attached %s (no dedicated panic_write!)\n", bdev_name);
return 0;
-err_put_bdev:
- psblk_bdev = NULL;
- psblk_put_bdev(bdev, holder);
- return ret;
-}
-
-static void __unregister_pstore_blk(unsigned int major)
-{
- struct pstore_device_info dev = { .read = psblk_generic_blk_read };
- void *holder = blkdev;
+err_fput:
+ fput(psblk_file);
+err:
+ psblk_file = NULL;
- lockdep_assert_held(&pstore_blk_lock);
- if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) {
- __unregister_pstore_device(&dev);
- psblk_put_bdev(psblk_bdev, holder);
- psblk_bdev = NULL;
- }
+ return ret;
}
/* get information of pstore/blk */
@@ -419,13 +254,93 @@ int pstore_blk_get_config(struct pstore_blk_config *info)
}
EXPORT_SYMBOL_GPL(pstore_blk_get_config);
+
+#ifndef MODULE
+static const char devname[] = "/dev/pstore-blk";
+static __init const char *early_boot_devpath(const char *initial_devname)
+{
+ /*
+ * During early boot the real root file system hasn't been
+ * mounted yet, and no device nodes are present yet. Use the
+ * same scheme to find the device that we use for mounting
+ * the root file system.
+ */
+ dev_t dev = name_to_dev_t(initial_devname);
+
+ if (!dev) {
+ pr_err("failed to resolve '%s'!\n", initial_devname);
+ return initial_devname;
+ }
+
+ init_unlink(devname);
+ init_mknod(devname, S_IFBLK | 0600, new_encode_dev(dev));
+
+ return devname;
+}
+#else
+static inline const char *early_boot_devpath(const char *initial_devname)
+{
+ return initial_devname;
+}
+#endif
+
+static int __init __best_effort_init(void)
+{
+ struct pstore_device_info *best_effort_dev;
+ int ret;
+
+ /* No best-effort mode requested. */
+ if (!best_effort)
+ return 0;
+
+ /* Reject an empty blkdev. */
+ if (!blkdev[0]) {
+ pr_err("blkdev empty with best_effort=Y\n");
+ return -EINVAL;
+ }
+
+ best_effort_dev = kzalloc(sizeof(*best_effort_dev), GFP_KERNEL);
+ if (!best_effort_dev)
+ return -ENOMEM;
+
+ best_effort_dev->zone.read = psblk_generic_blk_read;
+ best_effort_dev->zone.write = psblk_generic_blk_write;
+
+ ret = __register_pstore_blk(best_effort_dev,
+ early_boot_devpath(blkdev));
+ if (ret)
+ kfree(best_effort_dev);
+ else
+ pr_info("attached %s (%zu) (no dedicated panic_write!)\n",
+ blkdev, best_effort_dev->zone.total_size);
+
+ return ret;
+}
+
+static void __exit __best_effort_exit(void)
+{
+ /*
+ * Currently, the only user of psblk_file is best_effort, so
+ * we can assume that pstore_device_info is associated with it.
+ * Once there are "real" blk devices, there will need to be a
+ * dedicated pstore_blk_info, etc.
+ */
+ if (psblk_file) {
+ struct pstore_device_info *dev = pstore_device_info;
+
+ __unregister_pstore_device(dev);
+ kfree(dev);
+ fput(psblk_file);
+ psblk_file = NULL;
+ }
+}
+
static int __init pstore_blk_init(void)
{
- int ret = 0;
+ int ret;
mutex_lock(&pstore_blk_lock);
- if (!pstore_zone_info && best_effort && blkdev[0])
- ret = __register_pstore_blk();
+ ret = __best_effort_init();
mutex_unlock(&pstore_blk_lock);
return ret;
@@ -435,15 +350,9 @@ late_initcall(pstore_blk_init);
static void __exit pstore_blk_exit(void)
{
mutex_lock(&pstore_blk_lock);
- if (psblk_bdev)
- __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev));
- else {
- struct pstore_device_info dev = { };
-
- if (pstore_zone_info)
- dev.read = pstore_zone_info->read;
- __unregister_pstore_device(&dev);
- }
+ __best_effort_exit();
+ /* If we've been asked to unload, unregister any remaining device. */
+ __unregister_pstore_device(pstore_device_info);
mutex_unlock(&pstore_blk_lock);
}
module_exit(pstore_blk_exit);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 05e4bd9ab6d6..2bcc9a6f1bfc 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -968,31 +968,30 @@ out:
return ret;
}
-SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *,
- mountpoint, qid_t, id, void __user *, addr)
+SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
+ qid_t, id, void __user *, addr)
{
struct super_block *sb;
- struct path mountpath;
unsigned int cmds = cmd >> SUBCMDSHIFT;
unsigned int type = cmd & SUBCMDMASK;
+ struct fd f;
int ret;
- if (type >= MAXQUOTAS)
- return -EINVAL;
+ f = fdget_raw(fd);
+ if (!f.file)
+ return -EBADF;
- ret = user_path_at(AT_FDCWD, mountpoint,
- LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, &mountpath);
- if (ret)
- return ret;
-
- sb = mountpath.mnt->mnt_sb;
+ ret = -EINVAL;
+ if (type >= MAXQUOTAS)
+ goto out;
if (quotactl_cmd_write(cmds)) {
- ret = mnt_want_write(mountpath.mnt);
+ ret = mnt_want_write(f.file->f_path.mnt);
if (ret)
goto out;
}
+ sb = f.file->f_path.mnt->mnt_sb;
if (quotactl_cmd_onoff(cmds))
down_write(&sb->s_umount);
else
@@ -1006,9 +1005,8 @@ SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *,
up_read(&sb->s_umount);
if (quotactl_cmd_write(cmds))
- mnt_drop_write(mountpath.mnt);
+ mnt_drop_write(f.file->f_path.mnt);
out:
- path_put(&mountpath);
-
+ fdput(f);
return ret;
}
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index c5562c871c8b..d3e995e1046f 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -47,15 +47,6 @@ static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
/ info->dqi_entry_size;
}
-static char *getdqbuf(size_t size)
-{
- char *buf = kmalloc(size, GFP_NOFS);
- if (!buf)
- printk(KERN_WARNING
- "VFS: Not enough memory for quota buffers.\n");
- return buf;
-}
-
static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
{
struct super_block *sb = info->dqi_sb;
@@ -83,7 +74,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
int ret, blk;
@@ -132,7 +123,7 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
uint blk)
{
- char *tmpbuf = getdqbuf(info->dqi_usable_bs);
+ char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
uint nextblk = le32_to_cpu(dh->dqdh_next_free);
uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
@@ -179,7 +170,7 @@ out_buf:
static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
uint blk)
{
- char *tmpbuf = getdqbuf(info->dqi_usable_bs);
+ char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
int err;
@@ -227,7 +218,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
{
uint blk, i;
struct qt_disk_dqdbheader *dh;
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
char *ddquot;
*err = 0;
@@ -298,7 +289,7 @@ out_buf:
static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
uint *treeblk, int depth)
{
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
int ret = 0, newson = 0, newact = 0;
__le32 *ref;
uint newblk;
@@ -375,7 +366,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
int type = dquot->dq_id.type;
struct super_block *sb = dquot->dq_sb;
ssize_t ret;
- char *ddquot = getdqbuf(info->dqi_entry_size);
+ char *ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
if (!ddquot)
return -ENOMEM;
@@ -414,7 +405,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
uint blk)
{
struct qt_disk_dqdbheader *dh;
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
int ret = 0;
if (!buf)
@@ -474,7 +465,7 @@ out_buf:
static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
uint *blk, int depth)
{
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
int ret = 0;
uint newblk;
__le32 *ref = (__le32 *)buf;
@@ -533,7 +524,7 @@ EXPORT_SYMBOL(qtree_delete_dquot);
static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
struct dquot *dquot, uint blk)
{
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
loff_t ret = 0;
int i;
char *ddquot;
@@ -571,7 +562,7 @@ out_buf:
static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
struct dquot *dquot, uint blk, int depth)
{
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
loff_t ret = 0;
__le32 *ref = (__le32 *)buf;
@@ -635,7 +626,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
}
dquot->dq_off = offset;
}
- ddquot = getdqbuf(info->dqi_entry_size);
+ ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
if (!ddquot)
return -ENOMEM;
ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
@@ -679,7 +670,7 @@ EXPORT_SYMBOL(qtree_release_dquot);
static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
unsigned int blk, int depth)
{
- char *buf = getdqbuf(info->dqi_usable_bs);
+ char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
__le32 *ref = (__le32 *)buf;
ssize_t ret;
unsigned int epb = info->dqi_usable_bs >> 2;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 9ebd17d7befb..65e7e56005b8 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -53,13 +53,6 @@ struct ramfs_fs_info {
static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;
-static const struct address_space_operations ramfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
-};
-
struct inode *ramfs_get_inode(struct super_block *sb,
const struct inode *dir, umode_t mode, dev_t dev)
{
@@ -68,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(&init_user_ns, inode, dir, mode);
- inode->i_mapping->a_ops = &ramfs_aops;
+ inode->i_mapping->a_ops = &ram_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 780bb90c1804..f49b72ccac4c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2584,9 +2584,7 @@ static int reiserfs_write_full_page(struct page *page,
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
} else if ((checked || buffer_dirty(bh)) &&
- (!buffer_mapped(bh) || (buffer_mapped(bh)
- && bh->b_blocknr ==
- 0))) {
+ (!buffer_mapped(bh) || bh->b_blocknr == 0)) {
/*
* not mapped yet, or it points to a direct item, search
* the btree for the mapping info, and log any direct
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9edc8e2b154e..0834b101c316 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2758,6 +2758,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
goto free_and_return;
}
+ /*
+ * Sanity check that the journal's first block is valid.
+ * An invalid first block can cause important superblock
+ * members to be zeroed.
+ */
+ if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
+ SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) {
+ reiserfs_warning(sb, "journal-1393",
+ "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d",
+ SB_JOURNAL_1st_RESERVED_BLOCK(sb),
+ SB_ONDISK_JOURNAL_1st_BLOCK(sb));
+ goto free_and_return;
+ }
+
if (journal_init_dev(sb, journal, j_dev_name) != 0) {
reiserfs_warning(sb, "sh-462",
"unable to initialize journal device");
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 017db70d0f48..3d7a35d6a18b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -132,6 +132,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
return IO_ERROR;
}
PATH_LAST_POSITION(path)--;
+ break;
case ITEM_FOUND:
break;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 5059248f2d64..b117b212ef28 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -356,6 +356,31 @@ int seq_release(struct inode *inode, struct file *file)
EXPORT_SYMBOL(seq_release);
/**
+ * seq_escape_mem - print data into buffer, escaping some characters
+ * @m: target buffer
+ * @src: source buffer
+ * @len: size of source buffer
+ * @flags: flags to pass to string_escape_mem()
+ * @esc: set of characters that need escaping
+ *
+ * Puts data into buffer, replacing each occurrence of character from
+ * given class (defined by @flags and @esc) with printable escaped sequence.
+ *
+ * Use seq_has_overflowed() to check for errors.
+ */
+void seq_escape_mem(struct seq_file *m, const char *src, size_t len,
+ unsigned int flags, const char *esc)
+{
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
+
+ ret = string_escape_mem(src, len, buf, size, flags, esc);
+ seq_commit(m, ret < size ? ret : -1);
+}
+EXPORT_SYMBOL(seq_escape_mem);
+
+/**
* seq_escape - print string into buffer, escaping some characters
* @m: target buffer
* @s: string
@@ -367,26 +392,10 @@ EXPORT_SYMBOL(seq_release);
*/
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *buf;
- size_t size = seq_get_buf(m, &buf);
- int ret;
-
- ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
- seq_commit(m, ret < size ? ret : -1);
+ seq_escape_str(m, s, ESCAPE_OCTAL, esc);
}
EXPORT_SYMBOL(seq_escape);
-void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz)
-{
- char *buf;
- size_t size = seq_get_buf(m, &buf);
- int ret;
-
- ret = string_escape_mem_ascii(src, isz, buf, size);
- seq_commit(m, ret < size ? ret : -1);
-}
-EXPORT_SYMBOL(seq_escape_mem_ascii);
-
void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 040a1142915f..167b5889db4b 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -114,29 +114,24 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
break;
case SIL_FAULT_BNDERR:
case SIL_FAULT_PKUERR:
+ case SIL_PERF_EVENT:
/*
- * Fall through to the SIL_FAULT case. Both SIL_FAULT_BNDERR
- * and SIL_FAULT_PKUERR are only generated by faults that
- * deliver them synchronously to userspace. In case someone
- * injects one of these signals and signalfd catches it treat
- * it as SIL_FAULT.
+ * Fall through to the SIL_FAULT case. SIL_FAULT_BNDERR,
+ * SIL_FAULT_PKUERR, and SIL_PERF_EVENT are only
+ * generated by faults that deliver them synchronously to
+ * userspace. In case someone injects one of these signals
+ * and signalfd catches it treat it as SIL_FAULT.
*/
case SIL_FAULT:
new.ssi_addr = (long) kinfo->si_addr;
-#ifdef __ARCH_SI_TRAPNO
- new.ssi_trapno = kinfo->si_trapno;
-#endif
break;
- case SIL_FAULT_MCEERR:
+ case SIL_FAULT_TRAPNO:
new.ssi_addr = (long) kinfo->si_addr;
-#ifdef __ARCH_SI_TRAPNO
new.ssi_trapno = kinfo->si_trapno;
-#endif
- new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
- case SIL_PERF_EVENT:
+ case SIL_FAULT_MCEERR:
new.ssi_addr = (long) kinfo->si_addr;
- new.ssi_perf = kinfo->si_perf;
+ new.ssi_addr_lsb = (short) kinfo->si_addr_lsb;
break;
case SIL_CHLD:
new.ssi_pid = kinfo->si_pid;
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index b9e87ebb1060..855f0e87066d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -226,8 +226,11 @@ out_free_bio:
bio_free_pages(bio);
bio_put(bio);
out:
- if (res < 0)
+ if (res < 0) {
ERROR("Failed to read block 0x%llx: %d\n", index, res);
+ if (msblk->panic_on_errors)
+ panic("squashfs read failed");
+ }
return res;
}
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 166e98806265..1e90c2575f9b 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -65,5 +65,6 @@ struct squashfs_sb_info {
unsigned int fragments;
int xattr_ids;
unsigned int ids;
+ bool panic_on_errors;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 88cc94be1076..60d6951915f4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -18,9 +18,11 @@
#include <linux/fs.h>
#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/vfs.h>
#include <linux/slab.h>
#include <linux/mutex.h>
+#include <linux/seq_file.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -37,6 +39,51 @@
static struct file_system_type squashfs_fs_type;
static const struct super_operations squashfs_super_ops;
+enum Opt_errors {
+ Opt_errors_continue,
+ Opt_errors_panic,
+};
+
+enum squashfs_param {
+ Opt_errors,
+};
+
+struct squashfs_mount_opts {
+ enum Opt_errors errors;
+};
+
+static const struct constant_table squashfs_param_errors[] = {
+ {"continue", Opt_errors_continue },
+ {"panic", Opt_errors_panic },
+ {}
+};
+
+static const struct fs_parameter_spec squashfs_fs_parameters[] = {
+ fsparam_enum("errors", Opt_errors, squashfs_param_errors),
+ {}
+};
+
+static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct squashfs_mount_opts *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, squashfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_errors:
+ opts->errors = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static const struct squashfs_decompressor *supported_squashfs_filesystem(
struct fs_context *fc,
short major, short minor, short id)
@@ -67,6 +114,7 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem(
static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct squashfs_mount_opts *opts = fc->fs_private;
struct squashfs_sb_info *msblk;
struct squashfs_super_block *sblk = NULL;
struct inode *root;
@@ -85,6 +133,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
}
msblk = sb->s_fs_info;
+ msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
+
msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
msblk->devblksize_log2 = ffz(~msblk->devblksize);
@@ -350,18 +400,52 @@ static int squashfs_get_tree(struct fs_context *fc)
static int squashfs_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ struct squashfs_mount_opts *opts = fc->fs_private;
+
sync_filesystem(fc->root->d_sb);
fc->sb_flags |= SB_RDONLY;
+
+ msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
+
return 0;
}
+static void squashfs_free_fs_context(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
static const struct fs_context_operations squashfs_context_ops = {
.get_tree = squashfs_get_tree,
+ .free = squashfs_free_fs_context,
+ .parse_param = squashfs_parse_param,
.reconfigure = squashfs_reconfigure,
};
+static int squashfs_show_options(struct seq_file *s, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+
+ if (msblk->panic_on_errors)
+ seq_puts(s, ",errors=panic");
+ else
+ seq_puts(s, ",errors=continue");
+
+ return 0;
+}
+
static int squashfs_init_fs_context(struct fs_context *fc)
{
+ struct squashfs_mount_opts *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ fc->fs_private = opts;
fc->ops = &squashfs_context_ops;
return 0;
}
@@ -481,6 +565,7 @@ static struct file_system_type squashfs_fs_type = {
.owner = THIS_MODULE,
.name = "squashfs",
.init_fs_context = squashfs_init_fs_context,
+ .parameters = squashfs_fs_parameters,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV
};
@@ -491,6 +576,7 @@ static const struct super_operations squashfs_super_ops = {
.free_inode = squashfs_free_inode,
.statfs = squashfs_statfs,
.put_super = squashfs_put_super,
+ .show_options = squashfs_show_options,
};
module_init(init_squashfs_fs);
diff --git a/fs/super.c b/fs/super.c
index 11b7e7213fd1..91b7f156735b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1277,9 +1277,9 @@ int get_tree_bdev(struct fs_context *fc,
}
/*
- * s_umount nests inside bd_mutex during
+ * s_umount nests inside open_mutex during
* __invalidate_device(). blkdev_put() acquires
- * bd_mutex and can't be called under s_umount. Drop
+ * open_mutex and can't be called under s_umount. Drop
* s_umount temporarily. This is safe as we're
* holding an active reference.
*/
@@ -1352,9 +1352,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
}
/*
- * s_umount nests inside bd_mutex during
+ * s_umount nests inside open_mutex during
* __invalidate_device(). blkdev_put() acquires
- * bd_mutex and can't be called under s_umount. Drop
+ * open_mutex and can't be called under s_umount. Drop
* s_umount temporarily. This is safe as we're
* holding an active reference.
*/
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 8b2e99b7bc9f..749385015a8d 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -495,6 +495,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations sysv_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = sysv_readpage,
.writepage = sysv_writepage,
.write_begin = sysv_write_begin,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 2846dcd92197..1baff8ddb754 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -125,6 +125,7 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin
}
const struct address_space_operations udf_adinicb_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = udf_adinicb_readpage,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 0dd2f93ac048..4917670860a0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -235,6 +235,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations udf_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = udf_readpage,
.readahead = udf_readahead,
.writepage = udf_writepage,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 3ae9f1e91984..7c7c9bbbfa57 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -934,6 +934,10 @@ static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir,
iinfo->i_location.partitionReferenceNum,
0);
epos.bh = udf_tgetblk(sb, block);
+ if (unlikely(!epos.bh)) {
+ err = -ENOMEM;
+ goto out_no_entry;
+ }
lock_buffer(epos.bh);
memset(epos.bh->b_data, 0x00, bsize);
set_buffer_uptodate(epos.bh);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index debc282c1bb4..ac628de69601 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,6 +526,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations ufs_aops = {
+ .set_page_dirty = __set_page_dirty_buffers,
.readpage = ufs_readpage,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 14f92285d04f..f6e0f0c0d0e5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -337,7 +337,7 @@ out:
return ret;
}
-static inline long userfaultfd_get_blocking_state(unsigned int flags)
+static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
{
if (flags & FAULT_FLAG_INTERRUPTIBLE)
return TASK_INTERRUPTIBLE;
@@ -370,7 +370,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
struct userfaultfd_wait_queue uwq;
vm_fault_t ret = VM_FAULT_SIGBUS;
bool must_wait;
- long blocking_state;
+ unsigned int blocking_state;
/*
* We don't do userfault handling for the final child pid update.
@@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
}
if (vm_flags & VM_UFFD_MINOR) {
- /* FIXME: Add minor fault interception for shmem. */
- if (!is_vm_hugetlb_page(vma))
+ if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
return false;
}
@@ -1304,8 +1303,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags = 0;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
- if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+ goto out;
+#endif
vm_flags |= VM_UFFD_WP;
+ }
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
goto out;
@@ -1941,7 +1944,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
- uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+ uffdio_api.features &=
+ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+#endif
+#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
+ uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e32a1833d523..e0d5cdc57cc2 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -325,10 +325,22 @@ out:
error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0);
if (error2)
return error2;
- ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
- xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
- pag->pagf_freeblks + pag->pagf_flcount);
+
+ /*
+ * If there isn't enough space in the AG to satisfy the
+ * reservation, let the caller know that there wasn't enough
+ * space. Callers are responsible for deciding what to do
+ * next, since (in theory) we can stumble along with
+ * insufficient reservation if data blocks are being freed to
+ * replenish the AG's free space.
+ */
+ if (!error &&
+ xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+ xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
+ pag->pagf_freeblks + pag->pagf_flcount)
+ error = -ENOSPC;
}
+
return error;
}
@@ -354,7 +366,7 @@ xfs_ag_resv_alloc_extent(
break;
default:
ASSERT(0);
- /* fall through */
+ fallthrough;
case XFS_AG_RESV_NONE:
field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
XFS_TRANS_SB_FDBLOCKS;
@@ -396,7 +408,7 @@ xfs_ag_resv_free_extent(
break;
default:
ASSERT(0);
- /* fall through */
+ fallthrough;
case XFS_AG_RESV_NONE:
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
return;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 82b7cbb1f24f..af3d5f9271f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3174,7 +3174,7 @@ xfs_alloc_vextent(
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
- /* FALLTHROUGH */
+ fallthrough;
case XFS_ALLOCTYPE_FIRST_AG:
/*
* Rotate through the allocation groups looking for a winner.
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 7e3b9b01431e..a3e0e6f672d6 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -605,7 +605,6 @@ xfs_bmap_btree_to_extents(
ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
- ASSERT(!xfs_need_iread_extents(ifp));
ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
@@ -5350,7 +5349,6 @@ __xfs_bunmapi(
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t max_len;
- xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
bool done = false;
@@ -5442,16 +5440,6 @@ __xfs_bunmapi(
del = got;
wasdel = isnullstartblock(del.br_startblock);
- /*
- * Make sure we don't touch multiple AGF headers out of order
- * in a single transaction, as that could cause AB-BA deadlocks.
- */
- if (!wasdel && !isrt) {
- agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
- if (prev_agno != NULLAGNUMBER && prev_agno > agno)
- break;
- prev_agno = agno;
- }
if (got.br_startoff < start) {
del.br_startoff = start;
del.br_blockcount -= start - got.br_startoff;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 83ac9771bfb5..747ec77912c3 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -282,7 +282,7 @@ xfs_da3_node_read_verify(
__this_address);
break;
}
- /* fall through */
+ fallthrough;
case XFS_DA_NODE_MAGIC:
fa = xfs_da3_node_verify(bp);
if (fa)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index a83bdd0c47a8..bde2b4c64dbe 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -770,6 +770,8 @@ struct xfs_scrub_metadata {
/*
* ioctl commands that are used by Linux filesystems
*/
+#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
+#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
/*
@@ -780,6 +782,8 @@ struct xfs_scrub_metadata {
#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
+#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 5c9a7440d9e4..f3254a4f4cb4 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -559,8 +559,17 @@ xfs_dinode_calc_crc(
/*
* Validate di_extsize hint.
*
- * The rules are documented at xfs_ioctl_setattr_check_extsize().
- * These functions must be kept in sync with each other.
+ * 1. Extent size hint is only valid for directories and regular files.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files.
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. Hint cannot be larger than MAXEXTLEN.
+ * 5. Can be changed on directories at any time.
+ * 6. Hint value of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * For realtime files, this is the rt extent size.
+ * 8. For non-realtime files, the extent size hint must be limited
+ * to half the AG size to avoid alignment extending the extent beyond the
+ * limits of the AG.
*/
xfs_failaddr_t
xfs_inode_validate_extsize(
@@ -580,6 +589,28 @@ xfs_inode_validate_extsize(
inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT);
extsize_bytes = XFS_FSB_TO_B(mp, extsize);
+ /*
+ * This comment describes a historic gap in this verifier function.
+ *
+ * On older kernels, the extent size hint verifier doesn't check that
+ * the extent size hint is an integer multiple of the realtime extent
+ * size on a directory with both RTINHERIT and EXTSZINHERIT flags set.
+ * The verifier has always enforced the alignment rule for regular
+ * files with the REALTIME flag set.
+ *
+ * If a directory with a misaligned extent size hint is allowed to
+ * propagate that hint into a new regular realtime file, the result
+ * is that the inode cluster buffer verifier will trigger a corruption
+ * shutdown the next time it is run.
+ *
+ * Unfortunately, there could be filesystems with these misconfigured
+ * directories in the wild, so we cannot add a check to this verifier
+ * at this time because that will result in a new source of directory
+ * corruption errors when reading an existing filesystem. Instead, we
+ * permit the misconfiguration to pass through the verifiers so that
+ * callers of this function can correct and mitigate externally.
+ */
+
if (rt_flag)
blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
else
@@ -616,8 +647,15 @@ xfs_inode_validate_extsize(
/*
* Validate di_cowextsize hint.
*
- * The rules are documented at xfs_ioctl_setattr_check_cowextsize().
- * These functions must be kept in sync with each other.
+ * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
+ * The inode does not have to have any shared blocks, but it must be a v3.
+ * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
+ * for a directory, the hint is propagated to new files.
+ * 3. Can be changed on files & directories at any time.
+ * 4. Hint value of 0 turns off hints, clears inode flags.
+ * 5. Extent size must be a multiple of the appropriate block size.
+ * 6. The extent size hint must be limited to half the AG size to avoid
+ * alignment extending the extent beyond the limits of the AG.
*/
xfs_failaddr_t
xfs_inode_validate_cowextsize(
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 78324e043e25..8d595a5c4abd 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -143,6 +143,23 @@ xfs_trans_log_inode(
}
/*
+ * Inode verifiers on older kernels don't check that the extent size
+ * hint is an integer multiple of the rt extent size on a directory
+ * with both rtinherit and extszinherit flags set. If we're logging a
+ * directory that is misconfigured in this way, clear the hint.
+ */
+ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+ xfs_info_once(ip->i_mount,
+ "Correcting misaligned extent size hint in inode 0x%llx.", ip->i_ino);
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+
+ /*
* Record the specific change for fdatasync optimisation. This allows
* fdatasync to skip log forces for inodes that are only timestamp
* dirty.
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 7a2f9b5f2db5..f96e84793cc9 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -86,6 +86,7 @@ xchk_superblock(
case -ENOSYS:
case -EFBIG:
error = -EFSCORRUPTED;
+ fallthrough;
default:
break;
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index b5ebf1d1b4db..77d5c4a0f09f 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -271,7 +271,7 @@ xchk_bmap_iextent_xref(
case XFS_DATA_FORK:
if (xfs_is_reflink_inode(info->sc->ip))
break;
- /* fall through */
+ fallthrough;
case XFS_ATTR_FORK:
xchk_xref_is_not_shared(info->sc, agbno,
irec->br_blockcount);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index a94bd8122c60..bd1172358964 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -44,7 +44,7 @@ __xchk_btree_process_error(
/* Note the badness but don't abort. */
sc->sm->sm_flags |= errflag;
*error = 0;
- /* fall through */
+ fallthrough;
default:
if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
trace_xchk_ifork_btree_op_error(sc, cur, level,
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index aa874607618a..6cc92291c46f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -74,14 +74,16 @@ __xchk_process_error(
return true;
case -EDEADLOCK:
/* Used to restart an op with deadlock avoidance. */
- trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
+ trace_xchk_deadlock_retry(
+ sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
+ sc->sm, *error);
break;
case -EFSBADCRC:
case -EFSCORRUPTED:
/* Note the badness but don't abort. */
sc->sm->sm_flags |= errflag;
*error = 0;
- /* fall through */
+ fallthrough;
default:
trace_xchk_op_error(sc, agno, bno, *error,
ret_ip);
@@ -134,7 +136,7 @@ __xchk_fblock_process_error(
/* Note the badness but don't abort. */
sc->sm->sm_flags |= errflag;
*error = 0;
- /* fall through */
+ fallthrough;
default:
trace_xchk_file_op_error(sc, whichfork, offset, *error,
ret_ip);
@@ -694,7 +696,7 @@ xchk_get_inode(
if (error)
return -ENOENT;
error = -EFSCORRUPTED;
- /* fall through */
+ fallthrough;
default:
trace_xchk_op_error(sc,
XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 653f3280e1c1..9f0dbb47c82c 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -47,7 +47,7 @@ xchk_da_process_error(
/* Note the badness but don't abort. */
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
*error = 0;
- /* fall through */
+ fallthrough;
default:
trace_xchk_file_op_error(sc, ds->dargs.whichfork,
xfs_dir2_da_to_db(ds->dargs.geo,
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c2857d854c83..b8202dd08939 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -947,7 +947,7 @@ xrep_ino_dqattach(
xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
- /* fall through */
+ fallthrough;
case -ESRCH:
error = 0;
break;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 826caa6b4a5a..cb4e0fcf4c76 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -561,7 +561,7 @@ const struct address_space_operations xfs_address_space_operations = {
.readahead = xfs_vm_readahead,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
- .set_page_dirty = iomap_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = iomap_releasepage,
.invalidatepage = iomap_invalidatepage,
.bmap = xfs_vm_bmap,
@@ -575,7 +575,7 @@ const struct address_space_operations xfs_address_space_operations = {
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = noop_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_no_writeback,
.invalidatepage = noop_invalidatepage,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a5e9d7d34023..2988efa97e14 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -71,18 +71,24 @@ xfs_zero_extent(
#ifdef CONFIG_XFS_RT
int
xfs_bmap_rtalloc(
- struct xfs_bmalloca *ap) /* bmap alloc argument struct */
+ struct xfs_bmalloca *ap)
{
- int error; /* error return value */
- xfs_mount_t *mp; /* mount point structure */
- xfs_extlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_extlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_rtblock_t rtb;
-
- mp = ap->ip->i_mount;
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_fileoff_t orig_offset = ap->offset;
+ xfs_rtblock_t rtb;
+ xfs_extlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t mod = 0; /* product factor for allocators */
+ xfs_extlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t orig_length = ap->length;
+ xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
+ xfs_extlen_t raminlen;
+ bool rtlocked = false;
+ bool ignore_locality = false;
+ int error;
+
align = xfs_get_extsz_hint(ap->ip);
+retry:
prod = align / mp->m_sb.sb_rextsize;
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 1, ap->eof, 0,
@@ -93,6 +99,15 @@ xfs_bmap_rtalloc(
ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
/*
+ * If we shifted the file offset downward to satisfy an extent size
+ * hint, increase minlen by that amount so that the allocator won't
+ * give us an allocation that's too short to cover at least one of the
+ * blocks that the caller asked for.
+ */
+ if (ap->offset != orig_offset)
+ minlen += orig_offset - ap->offset;
+
+ /*
* If the offset & length are not perfectly aligned
* then kill prod, it will just get us in trouble.
*/
@@ -116,10 +131,13 @@ xfs_bmap_rtalloc(
/*
* Lock out modifications to both the RT bitmap and summary inodes
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ if (!rtlocked) {
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ rtlocked = true;
+ }
/*
* If it's an allocation to an empty file at offset 0,
@@ -141,33 +159,59 @@ xfs_bmap_rtalloc(
/*
* Realtime allocation, done through xfs_rtallocate_extent.
*/
- do_div(ap->blkno, mp->m_sb.sb_rextsize);
+ if (ignore_locality)
+ ap->blkno = 0;
+ else
+ do_div(ap->blkno, mp->m_sb.sb_rextsize);
rtb = ap->blkno;
ap->length = ralen;
- error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
- &ralen, ap->wasdel, prod, &rtb);
+ raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize);
+ error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length,
+ &ralen, ap->wasdel, prod, &rtb);
if (error)
return error;
- ap->blkno = rtb;
- if (ap->blkno != NULLFSBLOCK) {
- ap->blkno *= mp->m_sb.sb_rextsize;
- ralen *= mp->m_sb.sb_rextsize;
- ap->length = ralen;
- ap->ip->i_nblocks += ralen;
+ if (rtb != NULLRTBLOCK) {
+ ap->blkno = rtb * mp->m_sb.sb_rextsize;
+ ap->length = ralen * mp->m_sb.sb_rextsize;
+ ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel)
- ap->ip->i_delayed_blks -= ralen;
+ ap->ip->i_delayed_blks -= ap->length;
/*
* Adjust the disk quota also. This was reserved
* earlier.
*/
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
- XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
- } else {
- ap->length = 0;
+ XFS_TRANS_DQ_RTBCOUNT, ap->length);
+ return 0;
}
+
+ if (align > mp->m_sb.sb_rextsize) {
+ /*
+ * We previously enlarged the request length to try to satisfy
+ * an extent size hint. The allocator didn't return anything,
+ * so reset the parameters to the original values and try again
+ * without alignment criteria.
+ */
+ ap->offset = orig_offset;
+ ap->length = orig_length;
+ minlen = align = mp->m_sb.sb_rextsize;
+ goto retry;
+ }
+
+ if (!ignore_locality && ap->blkno != 0) {
+ /*
+ * If we can't allocate near a specific rt extent, try again
+ * without locality criteria.
+ */
+ ignore_locality = true;
+ goto retry;
+ }
+
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
return 0;
}
#endif /* CONFIG_XFS_RT */
@@ -242,7 +286,7 @@ xfs_bmap_count_blocks(
*/
*count += btblocks - 1;
- /* fall through */
+ fallthrough;
case XFS_DINODE_FMT_EXTENTS:
*nextents = xfs_bmap_count_leaves(ifp, count);
break;
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 465fd9e048d4..1da59bdff245 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -84,7 +84,7 @@ xfs_fs_encode_fh(
case FILEID_INO32_GEN_PARENT:
fid->i32.parent_ino = XFS_I(parent)->i_ino;
fid->i32.parent_gen = parent->i_generation;
- /*FALLTHRU*/
+ fallthrough;
case FILEID_INO32_GEN:
fid->i32.ino = XFS_I(inode)->i_ino;
fid->i32.gen = inode->i_generation;
@@ -92,7 +92,7 @@ xfs_fs_encode_fh(
case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
fid64->parent_ino = XFS_I(parent)->i_ino;
fid64->parent_gen = parent->i_generation;
- /*FALLTHRU*/
+ fallthrough;
case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
fid64->ino = XFS_I(inode)->i_ino;
fid64->gen = inode->i_generation;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 396ef36dcd0a..3c0749ab9e40 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -863,7 +863,7 @@ xfs_break_layouts(
error = xfs_break_dax_layouts(inode, &retry);
if (error || retry)
break;
- /* fall through */
+ fallthrough;
case BREAK_WRITE:
error = xfs_break_leased_layouts(inode, iolock, &retry);
break;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0369eb22c1bb..1db99a909b23 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -690,6 +690,7 @@ xfs_inode_inherit_flags(
const struct xfs_inode *pip)
{
unsigned int di_flags = 0;
+ xfs_failaddr_t failaddr;
umode_t mode = VFS_I(ip)->i_mode;
if (S_ISDIR(mode)) {
@@ -729,6 +730,24 @@ xfs_inode_inherit_flags(
di_flags |= XFS_DIFLAG_FILESTREAM;
ip->i_diflags |= di_flags;
+
+ /*
+ * Inode verifiers on older kernels only check that the extent size
+ * hint is an integer multiple of the rt extent size on realtime files.
+ * They did not check the hint alignment on a directory with both
+ * rtinherit and extszinherit flags set. If the misaligned hint is
+ * propagated from a directory into a new realtime file, new file
+ * allocations will fail due to math errors in the rt allocator and/or
+ * trip the verifiers. Validate the hint settings in the new file so
+ * that we don't let broken hints propagate.
+ */
+ failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
+ VFS_I(ip)->i_mode, ip->i_diflags);
+ if (failaddr) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ }
}
/* Propagate di_flags2 from a parent inode to a child inode. */
@@ -737,12 +756,22 @@ xfs_inode_inherit_flags2(
struct xfs_inode *ip,
const struct xfs_inode *pip)
{
+ xfs_failaddr_t failaddr;
+
if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_cowextsize = pip->i_cowextsize;
}
if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
ip->i_diflags2 |= XFS_DIFLAG2_DAX;
+
+ /* Don't let invalid cowextsize hints propagate. */
+ failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
+ VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
+ if (failaddr) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ }
}
/*
@@ -848,7 +877,7 @@ xfs_init_new_inode(
xfs_inode_inherit_flags(ip, pip);
if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
xfs_inode_inherit_flags2(ip, pip);
- /* FALLTHROUGH */
+ fallthrough;
case S_IFLNK:
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_bytes = 0;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 3925bfcb2365..7b1796cede10 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -558,7 +558,7 @@ xfs_ioc_attrmulti_one(
case ATTR_OP_REMOVE:
value = NULL;
*len = 0;
- /* fall through */
+ fallthrough;
case ATTR_OP_SET:
error = mnt_want_write_file(parfilp);
if (error)
@@ -1267,20 +1267,8 @@ out_error:
}
/*
- * extent size hint validation is somewhat cumbersome. Rules are:
- *
- * 1. extent size hint is only valid for directories and regular files
- * 2. FS_XFLAG_EXTSIZE is only valid for regular files
- * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
- * 4. can only be changed on regular files if no extents are allocated
- * 5. can be changed on directories at any time
- * 6. extsize hint of 0 turns off hints, clears inode flags.
- * 7. Extent size must be a multiple of the appropriate block size.
- * 8. for non-realtime files, the extent size hint must be limited
- * to half the AG size to avoid alignment extending the extent beyond the
- * limits of the AG.
- *
- * Please keep this function in sync with xfs_scrub_inode_extsize.
+ * Validate a proposed extent size hint. For regular files, the hint can only
+ * be changed if no extents are allocated.
*/
static int
xfs_ioctl_setattr_check_extsize(
@@ -1288,86 +1276,65 @@ xfs_ioctl_setattr_check_extsize(
struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_extlen_t size;
- xfs_fsblock_t extsize_fsb;
+ xfs_failaddr_t failaddr;
+ uint16_t new_diflags;
if (!fa->fsx_valid)
return 0;
if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
- ((ip->i_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
+ XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize)
return -EINVAL;
- if (fa->fsx_extsize == 0)
- return 0;
-
- extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
- if (extsize_fsb > MAXEXTLEN)
+ if (fa->fsx_extsize & mp->m_blockmask)
return -EINVAL;
- if (XFS_IS_REALTIME_INODE(ip) ||
- (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
- size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
- } else {
- size = mp->m_sb.sb_blocksize;
- if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
+ new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+
+ /*
+ * Inode verifiers on older kernels don't check that the extent size
+ * hint is an integer multiple of the rt extent size on a directory
+ * with both rtinherit and extszinherit flags set. Don't let sysadmins
+ * misconfigure directories.
+ */
+ if ((new_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (new_diflags & XFS_DIFLAG_EXTSZINHERIT)) {
+ unsigned int rtextsize_bytes;
+
+ rtextsize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+ if (fa->fsx_extsize % rtextsize_bytes)
return -EINVAL;
}
- if (fa->fsx_extsize % size)
- return -EINVAL;
-
- return 0;
+ failaddr = xfs_inode_validate_extsize(ip->i_mount,
+ XFS_B_TO_FSB(mp, fa->fsx_extsize),
+ VFS_I(ip)->i_mode, new_diflags);
+ return failaddr != NULL ? -EINVAL : 0;
}
-/*
- * CoW extent size hint validation rules are:
- *
- * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
- * The inode does not have to have any shared blocks, but it must be a v3.
- * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
- * for a directory, the hint is propagated to new files.
- * 3. Can be changed on files & directories at any time.
- * 4. CoW extsize hint of 0 turns off hints, clears inode flags.
- * 5. Extent size must be a multiple of the appropriate block size.
- * 6. The extent size hint must be limited to half the AG size to avoid
- * alignment extending the extent beyond the limits of the AG.
- *
- * Please keep this function in sync with xfs_scrub_inode_cowextsize.
- */
static int
xfs_ioctl_setattr_check_cowextsize(
struct xfs_inode *ip,
struct fileattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
- xfs_extlen_t size;
- xfs_fsblock_t cowextsize_fsb;
+ xfs_failaddr_t failaddr;
+ uint64_t new_diflags2;
+ uint16_t new_diflags;
if (!fa->fsx_valid)
return 0;
- if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
- return 0;
-
- if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb))
+ if (fa->fsx_cowextsize & mp->m_blockmask)
return -EINVAL;
- if (fa->fsx_cowextsize == 0)
- return 0;
+ new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags);
+ new_diflags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
- cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
- if (cowextsize_fsb > MAXEXTLEN)
- return -EINVAL;
-
- size = mp->m_sb.sb_blocksize;
- if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
- return -EINVAL;
-
- if (fa->fsx_cowextsize % size)
- return -EINVAL;
-
- return 0;
+ failaddr = xfs_inode_validate_cowextsize(ip->i_mount,
+ XFS_B_TO_FSB(mp, fa->fsx_cowextsize),
+ VFS_I(ip)->i_mode, new_diflags, new_diflags2);
+ return failaddr != NULL ? -EINVAL : 0;
}
static int
@@ -1544,7 +1511,7 @@ xfs_ioc_getbmap(
switch (cmd) {
case XFS_IOC_GETBMAPA:
bmx.bmv_iflags = BMV_IF_ATTRFORK;
- /*FALLTHRU*/
+ fallthrough;
case XFS_IOC_GETBMAP:
/* struct getbmap is a strict subset of struct getbmapx. */
recsize = sizeof(struct getbmap);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d154f42e2dc6..d8cd2583dedb 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1036,7 +1036,7 @@ retry:
prealloc_blocks = 0;
goto retry;
}
- /*FALLTHRU*/
+ fallthrough;
default:
goto out_unlock;
}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c19a82adea1e..a002425377b5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2626,6 +2626,7 @@ xlog_covered_state(
case XLOG_STATE_COVER_IDLE:
if (iclogs_changed == 1)
return XLOG_STATE_COVER_IDLE;
+ fallthrough;
case XLOG_STATE_COVER_NEED:
case XLOG_STATE_COVER_NEED2:
break;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 3c392b1512ac..bb9860ec9a93 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -2,6 +2,8 @@
#ifndef __XFS_MESSAGE_H
#define __XFS_MESSAGE_H 1
+#include <linux/once_lite.h>
+
struct xfs_mount;
extern __printf(2, 3)
@@ -41,16 +43,7 @@ do { \
} while (0)
#define xfs_printk_once(func, dev, fmt, ...) \
-({ \
- static bool __section(".data.once") __print_once; \
- bool __ret_print_once = !__print_once; \
- \
- if (!__print_once) { \
- __print_once = true; \
- func(dev, fmt, ##__VA_ARGS__); \
- } \
- unlikely(__ret_print_once); \
-})
+ DO_ONCE_LITE(func, dev, fmt, ##__VA_ARGS__)
#define xfs_emerg_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
@@ -73,6 +66,8 @@ do { \
xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
#define xfs_notice_once(dev, fmt, ...) \
xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_info, dev, fmt, ##__VA_ARGS__)
void assfail(struct xfs_mount *mp, char *expr, char *f, int l);
void asswarn(struct xfs_mount *mp, char *expr, char *f, int l);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9aced0a00003..d11d032da0b4 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -294,7 +294,7 @@ xfs_trans_read_buf_map(
default:
if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
- /* fall through */
+ fallthrough;
case -ENOMEM:
case -EAGAIN:
return error;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index cd145d318b17..dbf03635869c 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -5,7 +5,7 @@
* Copyright (C) 2019 Western Digital Corporation or its affiliates.
*/
#include <linux/module.h>
-#include <linux/fs.h>
+#include <linux/pagemap.h>
#include <linux/magic.h>
#include <linux/iomap.h>
#include <linux/init.h>
@@ -185,7 +185,7 @@ static const struct address_space_operations zonefs_file_aops = {
.readahead = zonefs_readahead,
.writepage = zonefs_writepage,
.writepages = zonefs_writepages,
- .set_page_dirty = iomap_set_page_dirty,
+ .set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = iomap_releasepage,
.invalidatepage = iomap_invalidatepage,
.migratepage = iomap_migrate_page,