summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c10
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_super.c15
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/rxrpc.c12
-rw-r--r--fs/afs/super.c5
-rw-r--r--fs/afs/volume.c8
-rw-r--r--fs/autofs4/Kconfig2
-rw-r--r--fs/befs/linuxvfs.c15
-rw-r--r--fs/block_dev.c156
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/disk-io.c36
-rw-r--r--fs/btrfs/inode.c22
-rw-r--r--fs/btrfs/qgroup.c11
-rw-r--r--fs/btrfs/super.c10
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/inode.c22
-rw-r--r--fs/ceph/super.c35
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/char_dev.c86
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_unicode.c6
-rw-r--r--fs/cifs/cifs_unicode.h5
-rw-r--r--fs/cifs/cifsfs.c23
-rw-r--r--fs/cifs/cifsglob.h21
-rw-r--r--fs/cifs/cifsproto.h6
-rw-r--r--fs/cifs/cifssmb.c18
-rw-r--r--fs/cifs/connect.c37
-rw-r--r--fs/cifs/file.c363
-rw-r--r--fs/cifs/ioctl.c7
-rw-r--r--fs/cifs/misc.c136
-rw-r--r--fs/cifs/netmisc.c6
-rw-r--r--fs/cifs/smb1ops.c10
-rw-r--r--fs/cifs/smb2misc.c5
-rw-r--r--fs/cifs/smb2ops.c5
-rw-r--r--fs/cifs/smb2pdu.c34
-rw-r--r--fs/cifs/smb2transport.c30
-rw-r--r--fs/cifs/transport.c32
-rw-r--r--fs/coda/inode.c11
-rw-r--r--fs/compat.c1191
-rw-r--r--fs/dax.c297
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h1
-rw-r--r--fs/ecryptfs/main.c4
-rw-r--r--fs/eventpoll.c93
-rw-r--r--fs/exec.c1
-rw-r--r--fs/exofs/exofs.h1
-rw-r--r--fs/exofs/super.c17
-rw-r--r--fs/ext2/ext2.h4
-rw-r--r--fs/ext2/inode.c31
-rw-r--r--fs/ext2/ioctl.c1
-rw-r--r--fs/ext2/super.c87
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/inode.c35
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/super.c74
-rw-r--r--fs/fcntl.c157
-rw-r--r--fs/fhandle.c13
-rw-r--r--fs/fuse/dev.c13
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/fuse/inode.c42
-rw-r--r--fs/gfs2/bmap.c741
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c81
-rw-r--r--fs/gfs2/incore.h8
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/rgrp.c7
-rw-r--r--fs/gfs2/rgrp.h7
-rw-r--r--fs/gfs2/super.c11
-rw-r--r--fs/hugetlbfs/inode.c15
-rw-r--r--fs/inode.c3
-rw-r--r--fs/internal.h2
-rw-r--r--fs/iomap.c24
-rw-r--r--fs/jbd2/journal.c9
-rw-r--r--fs/jbd2/transaction.c12
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_imap.c1
-rw-r--r--fs/jfs/jfs_inode.c18
-rw-r--r--fs/jfs/jfs_inode.h1
-rw-r--r--fs/jfs/super.c79
-rw-r--r--fs/mount.h2
-rw-r--r--fs/namei.c23
-rw-r--r--fs/namespace.c3
-rw-r--r--fs/ncpfs/inode.c8
-rw-r--r--fs/ncpfs/ncp_fs_sb.h1
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/direct.c27
-rw-r--r--fs/nfs/super.c11
-rw-r--r--fs/nfs/write.c13
-rw-r--r--fs/nfsd/blocklayout.c7
-rw-r--r--fs/nfsd/nfs3xdr.c13
-rw-r--r--fs/nfsd/nfs4proc.c2
-rw-r--r--fs/nfsd/nfssvc.c36
-rw-r--r--fs/nfsd/nfsxdr.c10
-rw-r--r--fs/nfsd/vfs.c2
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/Makefile4
-rw-r--r--fs/notify/dnotify/dnotify.c25
-rw-r--r--fs/notify/fanotify/fanotify.c26
-rw-r--r--fs/notify/fanotify/fanotify.h1
-rw-r--r--fs/notify/fanotify/fanotify_user.c77
-rw-r--r--fs/notify/fdinfo.c16
-rw-r--r--fs/notify/fsnotify.c107
-rw-r--r--fs/notify/fsnotify.h48
-rw-r--r--fs/notify/group.c20
-rw-r--r--fs/notify/inode_mark.c199
-rw-r--r--fs/notify/inotify/inotify.h4
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c18
-rw-r--r--fs/notify/inotify/inotify_user.c81
-rw-r--r--fs/notify/mark.c642
-rw-r--r--fs/notify/vfsmount_mark.c108
-rw-r--r--fs/nsfs.c1
-rw-r--r--fs/ocfs2/cluster/heartbeat.c8
-rw-r--r--fs/ocfs2/cluster/tcp.c32
-rw-r--r--fs/open.c20
-rw-r--r--fs/orangefs/devorangefs-req.c13
-rw-r--r--fs/orangefs/dir.c640
-rw-r--r--fs/orangefs/downcall.h16
-rw-r--r--fs/orangefs/file.c9
-rw-r--r--fs/orangefs/inode.c19
-rw-r--r--fs/orangefs/namei.c5
-rw-r--r--fs/orangefs/orangefs-bufmap.c4
-rw-r--r--fs/orangefs/orangefs-debugfs.c3
-rw-r--r--fs/orangefs/orangefs-dev-proto.h7
-rw-r--r--fs/orangefs/orangefs-kernel.h10
-rw-r--r--fs/orangefs/orangefs-utils.c98
-rw-r--r--fs/orangefs/protocol.h9
-rw-r--r--fs/orangefs/super.c53
-rw-r--r--fs/orangefs/waitqueue.c9
-rw-r--r--fs/orangefs/xattr.c26
-rw-r--r--fs/proc/base.c15
-rw-r--r--fs/proc/generic.c1
-rw-r--r--fs/proc/proc_sysctl.c4
-rw-r--r--fs/proc/task_mmu.c17
-rw-r--r--fs/pstore/ftrace.c11
-rw-r--r--fs/pstore/inode.c147
-rw-r--r--fs/pstore/internal.h8
-rw-r--r--fs/pstore/platform.c301
-rw-r--r--fs/pstore/pmsg.c12
-rw-r--r--fs/pstore/ram.c130
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/quota/dquot.c31
-rw-r--r--fs/read_write.c75
-rw-r--r--fs/readdir.c165
-rw-r--r--fs/reiserfs/inode.c31
-rw-r--r--fs/reiserfs/ioctl.c1
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/lbalance.c2
-rw-r--r--fs/reiserfs/reiserfs.h3
-rw-r--r--fs/reiserfs/super.c92
-rw-r--r--fs/select.c437
-rw-r--r--fs/splice.c20
-rw-r--r--fs/stat.c97
-rw-r--r--fs/statfs.c140
-rw-r--r--fs/super.c53
-rw-r--r--fs/ubifs/debug.c10
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/super.c25
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/udf/file.c10
-rw-r--r--fs/udf/inode.c22
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/utimes.c66
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/kmem.c12
-rw-r--r--fs/xfs/kmem.h2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c57
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h12
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c172
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c354
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h14
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c43
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h22
-rw-r--r--fs/xfs/libxfs/xfs_btree.c17
-rw-r--r--fs/xfs/libxfs/xfs_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c7
-rw-r--r--fs/xfs/libxfs/xfs_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_fs.h13
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c90
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c56
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h4
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c70
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h23
-rw-r--r--fs/xfs/xfs_aops.c18
-rw-r--r--fs/xfs/xfs_bmap_item.c6
-rw-r--r--fs/xfs/xfs_bmap_util.c22
-rw-r--r--fs/xfs/xfs_buf.c32
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c15
-rw-r--r--fs/xfs/xfs_discard.c10
-rw-r--r--fs/xfs/xfs_extfree_item.c1
-rw-r--r--fs/xfs/xfs_fsmap.c940
-rw-r--r--fs/xfs/xfs_fsmap.h53
-rw-r--r--fs/xfs/xfs_icache.c58
-rw-r--r--fs/xfs/xfs_icache.h8
-rw-r--r--fs/xfs/xfs_inode.c9
-rw-r--r--fs/xfs/xfs_inode.h4
-rw-r--r--fs/xfs/xfs_inode_item.c29
-rw-r--r--fs/xfs/xfs_ioctl.c89
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c18
-rw-r--r--fs/xfs/xfs_linux.h85
-rw-r--r--fs/xfs/xfs_log.c4
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_qm.c11
-rw-r--r--fs/xfs/xfs_qm_syscalls.c3
-rw-r--r--fs/xfs/xfs_refcount_item.c1
-rw-r--r--fs/xfs/xfs_reflink.c39
-rw-r--r--fs/xfs/xfs_rmap_item.c1
-rw-r--r--fs/xfs/xfs_rtalloc.h22
-rw-r--r--fs/xfs/xfs_super.c8
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trace.h144
-rw-r--r--fs/xfs/xfs_trans.c51
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_trans_ail.c71
-rw-r--r--fs/xfs/xfs_trans_priv.h15
221 files changed, 6945 insertions, 4630 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index a89f3cfe3c7d..c202930086ed 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
goto err_names;
init_rwsem(&v9ses->rename_sem);
- rc = bdi_setup_and_register(&v9ses->bdi, "9p");
- if (rc)
- goto err_names;
-
v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(v9ses->clnt)) {
rc = PTR_ERR(v9ses->clnt);
p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
- goto err_bdi;
+ goto err_names;
}
v9ses->flags = V9FS_ACCESS_USER;
@@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
err_clnt:
p9_client_destroy(v9ses->clnt);
-err_bdi:
- bdi_destroy(&v9ses->bdi);
err_names:
kfree(v9ses->uname);
kfree(v9ses->aname);
@@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
kfree(v9ses->uname);
kfree(v9ses->aname);
- bdi_destroy(&v9ses->bdi);
-
spin_lock(&v9fs_sessionlist_lock);
list_del(&v9ses->slist);
spin_unlock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 443d12e02043..76eaf49abd3a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -114,7 +114,6 @@ struct v9fs_session_info {
kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
- struct backing_dev_info bdi;
struct rw_semaphore rename_sem;
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed8629196..a0965fb587a5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data)
*
*/
-static void
+static int
v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
int flags, void *data)
{
+ int ret;
+
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
sb->s_blocksize = 1 << sb->s_blocksize_bits;
@@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_xattr = v9fs_xattr_handlers;
} else
sb->s_op = &v9fs_super_ops;
- sb->s_bdi = &v9ses->bdi;
+
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
@@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
#endif
save_mount_options(sb, data);
+ return 0;
}
/**
@@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(sb);
goto clunk_fid;
}
- v9fs_fill_super(sb, v9ses, flags, data);
+ retval = v9fs_fill_super(sb, v9ses, flags, data);
+ if (retval)
+ goto release_sb;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
sb->s_d_op = &v9fs_cached_dentry_operations;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a6901360fb81..393672997cc2 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -318,7 +318,6 @@ struct afs_volume {
unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
struct rw_semaphore server_sem; /* lock for accessing current server */
- struct backing_dev_info bdi;
};
/*
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8f76b13d5549..d5990eb160bd 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -419,7 +419,7 @@ error_do_abort:
call->state = AFS_CALL_COMPLETE;
if (ret != -ECONNABORTED) {
rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT,
- -ret, "KSD");
+ ret, "KSD");
} else {
abort_code = 0;
offset = 0;
@@ -478,12 +478,12 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENOTCONN:
abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- abort_code, -ret, "KNC");
+ abort_code, ret, "KNC");
goto save_error;
case -ENOTSUPP:
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- abort_code, -ret, "KIV");
+ abort_code, ret, "KIV");
goto save_error;
case -ENODATA:
case -EBADMSG:
@@ -493,7 +493,7 @@ static void afs_deliver_to_call(struct afs_call *call)
if (call->state != AFS_CALL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- abort_code, EBADMSG, "KUM");
+ abort_code, -EBADMSG, "KUM");
goto save_error;
}
}
@@ -754,7 +754,7 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- RX_USER_ABORT, ENOMEM, "KOO");
+ RX_USER_ABORT, -ENOMEM, "KOO");
default:
_leave(" [error]");
return;
@@ -792,7 +792,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(afs_socket, call->rxcall,
- RX_USER_ABORT, ENOMEM, "KOO");
+ RX_USER_ABORT, -ENOMEM, "KOO");
}
_leave(" [error]");
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index fbdb022b75a2..c79633e5cfd8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
- sb->s_bdi = &as->volume->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
/* allocate the root inode and dentry */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 546f9d01710b..db73d6dad02b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
volume->cell = params->cell;
volume->vid = vlocation->vldb.vid[params->type];
- volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE;
- ret = bdi_setup_and_register(&volume->bdi, "afs");
- if (ret)
- goto error_bdi;
-
init_rwsem(&volume->server_sem);
/* look up all the applicable server records */
@@ -156,8 +151,6 @@ error:
return ERR_PTR(ret);
error_discard:
- bdi_destroy(&volume->bdi);
-error_bdi:
up_write(&params->cell->vl_sem);
for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume)
for (loop = volume->nservers - 1; loop >= 0; loop--)
afs_put_server(volume->servers[loop]);
- bdi_destroy(&volume->bdi);
kfree(volume);
_leave(" [destroyed]");
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
index 1204d6384d39..44727bf18297 100644
--- a/fs/autofs4/Kconfig
+++ b/fs/autofs4/Kconfig
@@ -7,7 +7,7 @@ config AUTOFS4_FS
automounter (amd), which is a pure user space daemon.
To use the automounter you need the user-space tools from
- <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
+ <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also
want to answer Y to "NFS file system support", below.
To compile this support as a module, choose M here: the module will be
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c500e954debb..63e7c4760bfb 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -58,6 +58,7 @@ static struct dentry *befs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
+static struct dentry *befs_get_parent(struct dentry *child);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
@@ -93,6 +94,7 @@ static const struct address_space_operations befs_symlink_aops = {
static const struct export_operations befs_export_operations = {
.fh_to_dentry = befs_fh_to_dentry,
.fh_to_parent = befs_fh_to_parent,
+ .get_parent = befs_get_parent,
};
/*
@@ -667,6 +669,19 @@ static struct dentry *befs_fh_to_parent(struct super_block *sb,
befs_nfs_get_inode);
}
+static struct dentry *befs_get_parent(struct dentry *child)
+{
+ struct inode *parent;
+ struct befs_inode_info *befs_ino = BEFS_I(d_inode(child));
+
+ parent = befs_iget(child->d_sb,
+ (unsigned long)befs_ino->i_parent.start);
+ if (IS_ERR(parent))
+ return ERR_CAST(parent);
+
+ return d_obtain_alias(parent);
+}
+
enum {
Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
};
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec4370..2a305c1a2d88 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/pagevec.h>
@@ -103,12 +104,11 @@ void invalidate_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0)
- return;
-
- invalidate_bh_lrus();
- lru_add_drain_all(); /* make sure all lru add caches are flushed */
- invalidate_mapping_pages(mapping, 0, -1);
+ if (mapping->nrpages) {
+ invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
+ invalidate_mapping_pages(mapping, 0, -1);
+ }
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But, for the strange corners, lets be cautious
*/
@@ -717,50 +717,18 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(bdev_write_page);
-/**
- * bdev_direct_access() - Get the address for directly-accessibly memory
- * @bdev: The device containing the memory
- * @dax: control and output parameters for ->direct_access
- *
- * If a block device is made up of directly addressable memory, this function
- * will tell the caller the PFN and the address of the memory. The address
- * may be directly dereferenced within the kernel without the need to call
- * ioremap(), kmap() or similar. The PFN is suitable for inserting into
- * page tables.
- *
- * Return: negative errno if an error occurs, otherwise the number of bytes
- * accessible at this address.
- */
-long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+ pgoff_t *pgoff)
{
- sector_t sector = dax->sector;
- long avail, size = dax->size;
- const struct block_device_operations *ops = bdev->bd_disk->fops;
+ phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
- /*
- * The device driver is allowed to sleep, in order to make the
- * memory directly accessible.
- */
- might_sleep();
-
- if (size < 0)
- return size;
- if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
- return -EOPNOTSUPP;
- if ((sector + DIV_ROUND_UP(size, 512)) >
- part_nr_sects_read(bdev->bd_part))
- return -ERANGE;
- sector += get_start_sect(bdev);
- if (sector % (PAGE_SIZE / 512))
+ if (pgoff)
+ *pgoff = PHYS_PFN(phys_off);
+ if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
return -EINVAL;
- avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
- if (!avail)
- return -ERANGE;
- if (avail > 0 && avail & ~PAGE_MASK)
- return -ENXIO;
- return min(avail, size);
+ return 0;
}
-EXPORT_SYMBOL_GPL(bdev_direct_access);
+EXPORT_SYMBOL(bdev_dax_pgoff);
/**
* bdev_dax_supported() - Check if the device supports dax for filesystem
@@ -774,62 +742,46 @@ EXPORT_SYMBOL_GPL(bdev_direct_access);
*/
int bdev_dax_supported(struct super_block *sb, int blocksize)
{
- struct blk_dax_ctl dax = {
- .sector = 0,
- .size = PAGE_SIZE,
- };
- int err;
+ struct block_device *bdev = sb->s_bdev;
+ struct dax_device *dax_dev;
+ pgoff_t pgoff;
+ int err, id;
+ void *kaddr;
+ pfn_t pfn;
+ long len;
if (blocksize != PAGE_SIZE) {
vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
return -EINVAL;
}
- err = bdev_direct_access(sb->s_bdev, &dax);
- if (err < 0) {
- switch (err) {
- case -EOPNOTSUPP:
- vfs_msg(sb, KERN_ERR,
- "error: device does not support dax");
- break;
- case -EINVAL:
- vfs_msg(sb, KERN_ERR,
- "error: unaligned partition for dax");
- break;
- default:
- vfs_msg(sb, KERN_ERR,
- "error: dax access failed (%d)", err);
- }
+ err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
+ if (err) {
+ vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
return err;
}
- return 0;
-}
-EXPORT_SYMBOL_GPL(bdev_dax_supported);
-
-/**
- * bdev_dax_capable() - Return if the raw device is capable for dax
- * @bdev: The device for raw block device access
- */
-bool bdev_dax_capable(struct block_device *bdev)
-{
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- };
+ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ if (!dax_dev) {
+ vfs_msg(sb, KERN_ERR, "error: device does not support dax");
+ return -EOPNOTSUPP;
+ }
- if (!IS_ENABLED(CONFIG_FS_DAX))
- return false;
+ id = dax_read_lock();
+ len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+ dax_read_unlock(id);
- dax.sector = 0;
- if (bdev_direct_access(bdev, &dax) < 0)
- return false;
+ put_dax(dax_dev);
- dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
- if (bdev_direct_access(bdev, &dax) < 0)
- return false;
+ if (len < 1) {
+ vfs_msg(sb, KERN_ERR,
+ "error: dax access failed (%ld)", len);
+ return len < 0 ? len : -EIO;
+ }
- return true;
+ return 0;
}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
/*
* pseudo-fs
@@ -885,6 +837,8 @@ static void bdev_evict_inode(struct inode *inode)
spin_lock(&bdev_lock);
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
+ /* Detach inode from wb early as bdi_put() may free bdi->wb */
+ inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
bdi_put(bdev->bd_bdi);
bdev->bd_bdi = &noop_backing_dev_info;
@@ -1451,7 +1405,6 @@ int revalidate_disk(struct gendisk *disk)
if (disk->fops->revalidate_disk)
ret = disk->fops->revalidate_disk(disk);
- blk_integrity_revalidate(disk);
bdev = bdget_disk(disk, 0);
if (!bdev)
return ret;
@@ -1556,8 +1509,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
if (!partno) {
ret = -ENXIO;
@@ -1622,6 +1573,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
+
+ if (bdev->bd_bdi == &noop_backing_dev_info)
+ bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
} else {
if (bdev->bd_contains == bdev) {
ret = 0;
@@ -1653,8 +1607,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
- bdi_put(bdev->bd_bdi);
- bdev->bd_bdi = &noop_backing_dev_info;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1876,12 +1828,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
kill_bdev(bdev);
bdev_write_inode(bdev);
- /*
- * Detaching bdev inode from its wb in __destroy_inode()
- * is too late: the queue which embeds its bdi (along with
- * root wb) can be gone as soon as we put_disk() below.
- */
- inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
@@ -2074,7 +2020,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct request_queue *q = bdev_get_queue(bdev);
struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
@@ -2110,18 +2055,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, false);
+ GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- /* Only punch if the device can do zeroing discard. */
- if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
- return -EOPNOTSUPP;
- error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, 0);
+ error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
GFP_KERNEL, 0);
break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4115901d906..3e21211e99c3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -810,7 +810,6 @@ struct btrfs_fs_info {
struct btrfs_super_block *super_for_commit;
struct super_block *sb;
struct inode *btree_inode;
- struct backing_dev_info bdi;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eb1ee7b6f532..061c1d1f774f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
return ret;
}
-static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
-{
- int err;
-
- err = bdi_setup_and_register(bdi, "btrfs");
- if (err)
- return err;
-
- bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- bdi->congested_fn = btrfs_congested_fn;
- bdi->congested_data = info;
- bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- return 0;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb,
goto fail;
}
- ret = setup_bdi(fs_info, &fs_info->bdi);
- if (ret) {
- err = ret;
- goto fail_srcu;
- }
-
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
- goto fail_bdi;
+ goto fail_srcu;
}
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
- sb->s_bdi = &fs_info->bdi;
btrfs_init_btree_inode(fs_info);
@@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
- fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- SZ_4M / PAGE_SIZE);
+ sb->s_bdi->congested_fn = btrfs_congested_fn;
+ sb->s_bdi->congested_data = fs_info;
+ sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
+ sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
@@ -3285,8 +3266,6 @@ fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_bdi:
- bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
@@ -4007,7 +3986,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->bio_counter);
- bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a18510be76c1..5e71f1ea3391 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7910,7 +7910,6 @@ struct btrfs_retry_complete {
static void btrfs_retry_endio_nocsum(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
- struct inode *inode;
struct bio_vec *bvec;
int i;
@@ -7918,12 +7917,12 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
goto end;
ASSERT(bio->bi_vcnt == 1);
- inode = bio->bi_io_vec->bv_page->mapping->host;
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
+ ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
done->uptodate = 1;
bio_for_each_segment_all(bvec, bio, i)
- clean_io_failure(BTRFS_I(done->inode), done->start, bvec->bv_page, 0);
+ clean_io_failure(BTRFS_I(done->inode), done->start,
+ bvec->bv_page, 0);
end:
complete(&done->done);
bio_put(bio);
@@ -7973,8 +7972,10 @@ next_block_or_try_again:
start += sectorsize;
- if (nr_sectors--) {
+ nr_sectors--;
+ if (nr_sectors) {
pgoff += sectorsize;
+ ASSERT(pgoff < PAGE_SIZE);
goto next_block_or_try_again;
}
}
@@ -7986,9 +7987,7 @@ static void btrfs_retry_endio(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
- struct inode *inode;
struct bio_vec *bvec;
- u64 start;
int uptodate;
int ret;
int i;
@@ -7998,11 +7997,8 @@ static void btrfs_retry_endio(struct bio *bio)
uptodate = 1;
- start = done->start;
-
ASSERT(bio->bi_vcnt == 1);
- inode = bio->bi_io_vec->bv_page->mapping->host;
- ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode));
+ ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
bio_for_each_segment_all(bvec, bio, i) {
ret = __readpage_endio_check(done->inode, io_bio, i,
@@ -8080,8 +8076,10 @@ next:
ASSERT(nr_sectors);
- if (--nr_sectors) {
+ nr_sectors--;
+ if (nr_sectors) {
pgoff += sectorsize;
+ ASSERT(pgoff < PAGE_SIZE);
goto next_block;
}
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a59801dc2a34..afbea61d957e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1042,9 +1042,12 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup,
u64 num_bytes)
{
- btrfs_warn(fs_info,
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_ON(qgroup->reserved < num_bytes);
+ btrfs_debug(fs_info,
"qgroup %llu reserved space underflow, have: %llu, to free: %llu",
qgroup->qgroupid, qgroup->reserved, num_bytes);
+#endif
qgroup->reserved = 0;
}
/*
@@ -1075,7 +1078,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
if (sign > 0) {
- if (WARN_ON(qgroup->reserved < num_bytes))
+ if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup, num_bytes);
else
qgroup->reserved -= num_bytes;
@@ -1100,7 +1103,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
if (sign > 0) {
- if (WARN_ON(qgroup->reserved < num_bytes))
+ if (qgroup->reserved < num_bytes)
report_reserved_underflow(fs_info, qgroup,
num_bytes);
else
@@ -2469,7 +2472,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- if (WARN_ON(qg->reserved < num_bytes))
+ if (qg->reserved < num_bytes)
report_reserved_underflow(fs_info, qg, num_bytes);
else
qg->reserved -= num_bytes;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index da687dc79cce..72a053c9a7f0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -549,16 +549,19 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_ssd:
btrfs_set_and_info(info, SSD,
"use ssd allocation scheme");
+ btrfs_clear_opt(info->mount_opt, NOSSD);
break;
case Opt_ssd_spread:
btrfs_set_and_info(info, SSD_SPREAD,
"use spread ssd allocation scheme");
btrfs_set_opt(info->mount_opt, SSD);
+ btrfs_clear_opt(info->mount_opt, NOSSD);
break;
case Opt_nossd:
btrfs_set_and_info(info, NOSSD,
"not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
+ btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
break;
case Opt_barrier:
btrfs_clear_and_info(info, NOBARRIER,
@@ -1133,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_flags |= MS_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
+
+ err = super_setup_bdi(sb);
+ if (err) {
+ btrfs_err(fs_info, "super_setup_bdi failed");
+ return err;
+ }
+
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 73d56eef5e60..ab8a66d852f9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6213,7 +6213,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev ||
- (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) {
+ (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
bbio_error(bbio, first_bio, logical);
continue;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1a3e1b40799a..9ecb2fd348cb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -700,7 +700,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
+ clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
if (rc < 0)
@@ -979,7 +979,7 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
- set_bdi_congested(&fsc->backing_dev_info,
+ set_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2ae393e2c31..3ef11bc8d728 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -251,7 +251,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->backing_dev_info.dev));
+ dev_name(fsc->sb->s_bdi->dev));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d449e1c03cbd..d3119fe3ab45 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2071,11 +2071,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
- if (ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(inode, attr->ia_mode);
- if (err)
- goto out_put;
- }
if (mask) {
req->r_inode = inode;
@@ -2089,13 +2084,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
ceph_cap_string(dirtied), mask);
ceph_mdsc_put_request(req);
- if (mask & CEPH_SETATTR_SIZE)
- __ceph_do_pending_vmtruncate(inode);
- ceph_free_cap_flush(prealloc_cf);
- return err;
-out_put:
- ceph_mdsc_put_request(req);
ceph_free_cap_flush(prealloc_cf);
+
+ if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
+ __ceph_do_pending_vmtruncate(inode);
+
return err;
}
@@ -2114,7 +2107,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0)
return err;
- return __ceph_setattr(inode, attr);
+ err = __ceph_setattr(inode, attr);
+
+ if (err >= 0 && (attr->ia_valid & ATTR_MODE))
+ err = posix_acl_chmod(inode, attr->ia_mode);
+
+ return err;
}
/*
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0ec8d0114e57..a8c81b2052ca 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -579,10 +579,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
atomic_long_set(&fsc->writeback_count, 0);
- err = bdi_init(&fsc->backing_dev_info);
- if (err < 0)
- goto fail_client;
-
err = -ENOMEM;
/*
* The number of concurrent works can be high but they don't need
@@ -590,7 +586,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
*/
fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
if (fsc->wb_wq == NULL)
- goto fail_bdi;
+ goto fail_client;
fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
@@ -624,8 +620,6 @@ fail_pg_inv_wq:
destroy_workqueue(fsc->pg_inv_wq);
fail_wb_wq:
destroy_workqueue(fsc->wb_wq);
-fail_bdi:
- bdi_destroy(&fsc->backing_dev_info);
fail_client:
ceph_destroy_client(fsc->client);
fail:
@@ -643,8 +637,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
- bdi_destroy(&fsc->backing_dev_info);
-
mempool_destroy(fsc->wb_pagevec_pool);
destroy_mount_options(fsc->mount_options);
@@ -937,33 +929,32 @@ static int ceph_compare_super(struct super_block *sb, void *data)
*/
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb,
- struct ceph_fs_client *fsc)
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
{
int err;
+ err = super_setup_bdi_name(sb, "ceph-%ld",
+ atomic_long_inc_return(&bdi_seq));
+ if (err)
+ return err;
+
/* set ra_pages based on rasize mount option? */
if (fsc->mount_options->rasize >= PAGE_SIZE)
- fsc->backing_dev_info.ra_pages =
+ sb->s_bdi->ra_pages =
(fsc->mount_options->rasize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else
- fsc->backing_dev_info.ra_pages =
- VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
fsc->mount_options->rsize >= PAGE_SIZE)
- fsc->backing_dev_info.io_pages =
+ sb->s_bdi->io_pages =
(fsc->mount_options->rsize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else if (fsc->mount_options->rsize == 0)
- fsc->backing_dev_info.io_pages = ULONG_MAX;
+ sb->s_bdi->io_pages = ULONG_MAX;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
- atomic_long_inc_return(&bdi_seq));
- if (!err)
- sb->s_bdi = &fsc->backing_dev_info;
- return err;
+ return 0;
}
static struct dentry *ceph_mount(struct file_system_type *fs_type,
@@ -1018,7 +1009,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
- err = ceph_register_bdi(sb, fsc);
+ err = ceph_setup_bdi(sb, fsc);
if (err < 0) {
res = ERR_PTR(err);
goto out_splat;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe6b9cfc4013..176186b12457 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -92,8 +92,6 @@ struct ceph_fs_client {
struct workqueue_struct *trunc_wq;
atomic_long_t writeback_count;
- struct backing_dev_info backing_dev_info;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 44a240c4bb65..fb8507f521b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -471,6 +471,85 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count)
return 0;
}
+/**
+ * cdev_set_parent() - set the parent kobject for a char device
+ * @p: the cdev structure
+ * @kobj: the kobject to take a reference to
+ *
+ * cdev_set_parent() sets a parent kobject which will be referenced
+ * appropriately so the parent is not freed before the cdev. This
+ * should be called before cdev_add.
+ */
+void cdev_set_parent(struct cdev *p, struct kobject *kobj)
+{
+ WARN_ON(!kobj->state_initialized);
+ p->kobj.parent = kobj;
+}
+
+/**
+ * cdev_device_add() - add a char device and it's corresponding
+ * struct device, linkink
+ * @dev: the device structure
+ * @cdev: the cdev structure
+ *
+ * cdev_device_add() adds the char device represented by @cdev to the system,
+ * just as cdev_add does. It then adds @dev to the system using device_add
+ * The dev_t for the char device will be taken from the struct device which
+ * needs to be initialized first. This helper function correctly takes a
+ * reference to the parent device so the parent will not get released until
+ * all references to the cdev are released.
+ *
+ * This helper uses dev->devt for the device number. If it is not set
+ * it will not add the cdev and it will be equivalent to device_add.
+ *
+ * This function should be used whenever the struct cdev and the
+ * struct device are members of the same structure whose lifetime is
+ * managed by the struct device.
+ *
+ * NOTE: Callers must assume that userspace was able to open the cdev and
+ * can call cdev fops callbacks at any time, even if this function fails.
+ */
+int cdev_device_add(struct cdev *cdev, struct device *dev)
+{
+ int rc = 0;
+
+ if (dev->devt) {
+ cdev_set_parent(cdev, &dev->kobj);
+
+ rc = cdev_add(cdev, dev->devt, 1);
+ if (rc)
+ return rc;
+ }
+
+ rc = device_add(dev);
+ if (rc)
+ cdev_del(cdev);
+
+ return rc;
+}
+
+/**
+ * cdev_device_del() - inverse of cdev_device_add
+ * @dev: the device structure
+ * @cdev: the cdev structure
+ *
+ * cdev_device_del() is a helper function to call cdev_del and device_del.
+ * It should be used whenever cdev_device_add is used.
+ *
+ * If dev->devt is not set it will not remove the cdev and will be equivalent
+ * to device_del.
+ *
+ * NOTE: This guarantees that associated sysfs callbacks are not running
+ * or runnable, however any cdevs already open will remain and their fops
+ * will still be callable even after this function returns.
+ */
+void cdev_device_del(struct cdev *cdev, struct device *dev)
+{
+ device_del(dev);
+ if (dev->devt)
+ cdev_del(cdev);
+}
+
static void cdev_unmap(dev_t dev, unsigned count)
{
kobj_unmap(cdev_map, dev, count);
@@ -482,6 +561,10 @@ static void cdev_unmap(dev_t dev, unsigned count)
*
* cdev_del() removes @p from the system, possibly freeing the structure
* itself.
+ *
+ * NOTE: This guarantees that cdev device will no longer be able to be
+ * opened, however any cdevs already open will remain and their fops will
+ * still be callable even after cdev_del returns.
*/
void cdev_del(struct cdev *p)
{
@@ -570,5 +653,8 @@ EXPORT_SYMBOL(cdev_init);
EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
+EXPORT_SYMBOL(cdev_set_parent);
+EXPORT_SYMBOL(cdev_device_add);
+EXPORT_SYMBOL(cdev_device_del);
EXPORT_SYMBOL(__register_chrdev);
EXPORT_SYMBOL(__unregister_chrdev);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 07ed81cf1552..cbd216b57239 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -68,7 +68,6 @@ struct cifs_sb_info {
umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
char *mountdata; /* options received at mount time or via DFS refs */
- struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
struct rcu_head rcu;
char *prepath;
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 02b071bf3732..a0b3e7d1be48 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target)
case SFM_COLON:
*target = ':';
break;
+ case SFM_DOUBLEQUOTE:
+ *target = '"';
+ break;
case SFM_ASTERISK:
*target = '*';
break;
@@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
case ':':
dest_char = cpu_to_le16(SFM_COLON);
break;
+ case '"':
+ dest_char = cpu_to_le16(SFM_DOUBLEQUOTE);
+ break;
case '*':
dest_char = cpu_to_le16(SFM_ASTERISK);
break;
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 3d7298cc0aeb..8a79a34e66b8 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -57,6 +57,7 @@
* not conflict (although almost does) with the mapping above.
*/
+#define SFM_DOUBLEQUOTE ((__u16) 0xF020)
#define SFM_ASTERISK ((__u16) 0xF021)
#define SFM_QUESTION ((__u16) 0xF025)
#define SFM_COLON ((__u16) 0xF022)
@@ -64,8 +65,8 @@
#define SFM_LESSTHAN ((__u16) 0xF023)
#define SFM_PIPE ((__u16) 0xF027)
#define SFM_SLASH ((__u16) 0xF026)
-#define SFM_PERIOD ((__u16) 0xF028)
-#define SFM_SPACE ((__u16) 0xF029)
+#define SFM_SPACE ((__u16) 0xF028)
+#define SFM_PERIOD ((__u16) 0xF029)
/*
* Mapping mechanism to use when one of the seven reserved characters is
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index dd3f5fabfdf6..9a1667e0e8d6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -37,6 +37,7 @@
#include <linux/freezer.h>
#include <linux/namei.h>
#include <linux/random.h>
+#include <linux/uuid.h>
#include <linux/xattr.h>
#include <net/ipv6.h>
#include "cifsfs.h"
@@ -87,6 +88,7 @@ extern mempool_t *cifs_req_poolp;
extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
+struct workqueue_struct *cifsoplockd_wq;
__u32 cifs_lock_secret;
/*
@@ -138,7 +140,12 @@ cifs_read_super(struct super_block *sb)
sb->s_magic = CIFS_MAGIC_NUMBER;
sb->s_op = &cifs_super_ops;
sb->s_xattr = cifs_xattr_handlers;
- sb->s_bdi = &cifs_sb->bdi;
+ rc = super_setup_bdi(sb);
+ if (rc)
+ goto out_no_root;
+ /* tune readahead according to rsize */
+ sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE;
+
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
inode = cifs_root_iget(sb);
@@ -1369,9 +1376,16 @@ init_cifs(void)
goto out_clean_proc;
}
+ cifsoplockd_wq = alloc_workqueue("cifsoplockd",
+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!cifsoplockd_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_cifsiod_wq;
+ }
+
rc = cifs_fscache_register();
if (rc)
- goto out_destroy_wq;
+ goto out_destroy_cifsoplockd_wq;
rc = cifs_init_inodecache();
if (rc)
@@ -1419,7 +1433,9 @@ out_destroy_inodecache:
cifs_destroy_inodecache();
out_unreg_fscache:
cifs_fscache_unregister();
-out_destroy_wq:
+out_destroy_cifsoplockd_wq:
+ destroy_workqueue(cifsoplockd_wq);
+out_destroy_cifsiod_wq:
destroy_workqueue(cifsiod_wq);
out_clean_proc:
cifs_proc_clean();
@@ -1442,6 +1458,7 @@ exit_cifs(void)
cifs_destroy_mids();
cifs_destroy_inodecache();
cifs_fscache_unregister();
+ destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d07f13a63369..8be55be70faf 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -948,7 +948,6 @@ struct cifs_tcon {
bool use_persistent:1; /* use persistent instead of durable handles */
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
- bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
__le32 capabilities;
__u32 share_flags;
__u32 maximal_access;
@@ -1116,6 +1115,23 @@ struct cifs_io_parms {
struct cifs_tcon *tcon;
};
+struct cifs_aio_ctx {
+ struct kref refcount;
+ struct list_head list;
+ struct mutex aio_mutex;
+ struct completion done;
+ struct iov_iter iter;
+ struct kiocb *iocb;
+ struct cifsFileInfo *cfile;
+ struct bio_vec *bv;
+ loff_t pos;
+ unsigned int npages;
+ ssize_t rc;
+ unsigned int len;
+ unsigned int total_len;
+ bool should_dirty;
+};
+
struct cifs_readdata;
/* asynchronous read support */
@@ -1125,6 +1141,7 @@ struct cifs_readdata {
struct completion done;
struct cifsFileInfo *cfile;
struct address_space *mapping;
+ struct cifs_aio_ctx *ctx;
__u64 offset;
unsigned int bytes;
unsigned int got_bytes;
@@ -1155,6 +1172,7 @@ struct cifs_writedata {
enum writeback_sync_modes sync_mode;
struct work_struct work;
struct cifsFileInfo *cfile;
+ struct cifs_aio_ctx *ctx;
__u64 offset;
pid_t pid;
unsigned int bytes;
@@ -1684,6 +1702,7 @@ void cifs_oplock_break(struct work_struct *work);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
+extern struct workqueue_struct *cifsoplockd_wq;
extern __u32 cifs_lock_secret;
extern mempool_t *cifs_mid_poolp;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ec5e5e514fdd..e49958c3f8bb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -79,8 +79,7 @@ extern void cifs_delete_mid(struct mid_q_entry *mid);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
extern int cifs_handle_standard(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
-extern int cifs_discard_remaining_data(struct TCP_Server_Info *server,
- char *buf);
+extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
extern int cifs_call_async(struct TCP_Server_Info *server,
struct smb_rqst *rqst,
mid_receive_t *receive, mid_callback_t *callback,
@@ -536,4 +535,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
struct shash_desc *shash);
enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
+struct cifs_aio_ctx *cifs_aio_ctx_alloc(void);
+void cifs_aio_ctx_release(struct kref *refcount);
+int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 967b92631807..205fd94f52fd 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -718,6 +718,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
if (rc)
return rc;
+ if (server->capabilities & CAP_UNICODE)
+ smb->hdr.Flags2 |= SMBFLG2_UNICODE;
+
/* set up echo request */
smb->hdr.Tid = 0xffff;
smb->hdr.WordCount = 1;
@@ -1400,9 +1403,9 @@ openRetry:
* current bigbuf.
*/
int
-cifs_discard_remaining_data(struct TCP_Server_Info *server, char *buf)
+cifs_discard_remaining_data(struct TCP_Server_Info *server)
{
- unsigned int rfclen = get_rfc1002_length(buf);
+ unsigned int rfclen = get_rfc1002_length(server->smallbuf);
int remaining = rfclen + 4 - server->total_read;
while (remaining > 0) {
@@ -1426,8 +1429,10 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
int length;
struct cifs_readdata *rdata = mid->callback_data;
- length = cifs_discard_remaining_data(server, mid->resp_buf);
+ length = cifs_discard_remaining_data(server);
dequeue_mid(mid, rdata->result);
+ mid->resp_buf = server->smallbuf;
+ server->smallbuf = NULL;
return length;
}
@@ -1459,7 +1464,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
if (server->ops->is_status_pending &&
server->ops->is_status_pending(buf, server, 0)) {
- cifs_discard_remaining_data(server, buf);
+ cifs_discard_remaining_data(server);
return -1;
}
@@ -1519,9 +1524,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n",
rdata->iov[0].iov_base, server->total_read);
- mid->resp_buf = server->smallbuf;
- server->smallbuf = NULL;
-
/* how much data is in the response? */
data_len = server->ops->read_data_length(buf);
if (data_offset + data_len > buflen) {
@@ -1544,6 +1546,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return cifs_readv_discard(server, mid);
dequeue_mid(mid, false);
+ mid->resp_buf = server->smallbuf;
+ server->smallbuf = NULL;
return length;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0c7596cef4b8..9365c0cf77ad 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
#include <linux/pagevec.h>
#include <linux/freezer.h>
#include <linux/namei.h>
+#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/inet.h>
@@ -1945,9 +1946,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
if (!got_ip) {
+ int len;
+ const char *slash;
+
/* No ip= option specified? Try to get it from UNC */
- if (!cifs_convert_address(dstaddr, &vol->UNC[2],
- strlen(&vol->UNC[2]))) {
+ /* Use the address part of the UNC. */
+ slash = strchr(&vol->UNC[2], '\\');
+ len = slash - &vol->UNC[2];
+ if (!cifs_convert_address(dstaddr, &vol->UNC[2], len)) {
pr_err("Unable to determine destination address.\n");
goto cifs_parse_mount_err;
}
@@ -2912,16 +2918,14 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
{
struct cifs_sb_info *old = CIFS_SB(sb);
struct cifs_sb_info *new = mnt_data->cifs_sb;
+ bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+ bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
- if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) {
- if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH))
- return 0;
- /* The prepath should be null terminated strings */
- if (strcmp(new->prepath, old->prepath))
- return 0;
-
+ if (old_set && new_set && !strcmp(new->prepath, old->prepath))
return 1;
- }
+ else if (!old_set && !new_set)
+ return 1;
+
return 0;
}
@@ -3692,10 +3696,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
int referral_walks_count = 0;
#endif
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DFS_UPCALL
try_mount_again:
/* cleanup activities if we're chasing a referral */
@@ -3723,7 +3723,6 @@ try_mount_again:
server = cifs_get_tcp_session(volume_info);
if (IS_ERR(server)) {
rc = PTR_ERR(server);
- bdi_destroy(&cifs_sb->bdi);
goto out;
}
if ((volume_info->max_credits < 20) ||
@@ -3753,6 +3752,9 @@ try_mount_again:
if (IS_ERR(tcon)) {
rc = PTR_ERR(tcon);
tcon = NULL;
+ if (rc == -EACCES)
+ goto mount_fail_check;
+
goto remote_path_check;
}
@@ -3777,9 +3779,6 @@ try_mount_again:
cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
- /* tune readahead according to rsize */
- cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
-
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
@@ -3896,7 +3895,6 @@ mount_fail_check:
cifs_put_smb_ses(ses);
else
cifs_put_tcp_session(server, 0);
- bdi_destroy(&cifs_sb->bdi);
}
out:
@@ -4099,7 +4097,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
- bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
kfree(cifs_sb->prepath);
call_rcu(&cifs_sb->rcu, delayed_free);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index aa3debbba826..6ef78ad838e6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2458,11 +2458,14 @@ cifs_uncached_writedata_release(struct kref *refcount)
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
+ kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < wdata->nr_pages; i++)
put_page(wdata->pages[i]);
cifs_writedata_release(refcount);
}
+static void collect_uncached_write_data(struct cifs_aio_ctx *ctx);
+
static void
cifs_uncached_writev_complete(struct work_struct *work)
{
@@ -2478,7 +2481,8 @@ cifs_uncached_writev_complete(struct work_struct *work)
spin_unlock(&inode->i_lock);
complete(&wdata->done);
-
+ collect_uncached_write_data(wdata->ctx);
+ /* the below call can possibly free the last ref to aio ctx */
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
@@ -2527,7 +2531,8 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
static int
cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
+ struct cifs_sb_info *cifs_sb, struct list_head *wdata_list,
+ struct cifs_aio_ctx *ctx)
{
int rc = 0;
size_t cur_len;
@@ -2595,9 +2600,11 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
wdata->pagesz = PAGE_SIZE;
wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
wdata->credits = credits;
+ wdata->ctx = ctx;
+ kref_get(&ctx->refcount);
if (!wdata->cfile->invalidHandle ||
- !cifs_reopen_file(wdata->cfile, false))
+ !(rc = cifs_reopen_file(wdata->cfile, false)))
rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
if (rc) {
@@ -2620,81 +2627,61 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
return rc;
}
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
{
- struct file *file = iocb->ki_filp;
- ssize_t total_written = 0;
- struct cifsFileInfo *open_file;
+ struct cifs_writedata *wdata, *tmp;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
- struct cifs_writedata *wdata, *tmp;
- struct list_head wdata_list;
- struct iov_iter saved_from = *from;
+ struct dentry *dentry = ctx->cfile->dentry;
+ unsigned int i;
int rc;
- /*
- * BB - optimize the way when signing is disabled. We can drop this
- * extra memory-to-memory copying and use iovec buffers for constructing
- * write request.
- */
-
- rc = generic_write_checks(iocb, from);
- if (rc <= 0)
- return rc;
-
- INIT_LIST_HEAD(&wdata_list);
- cifs_sb = CIFS_FILE_SB(file);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_writev)
- return -ENOSYS;
+ tcon = tlink_tcon(ctx->cfile->tlink);
+ cifs_sb = CIFS_SB(dentry->d_sb);
- rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from,
- open_file, cifs_sb, &wdata_list);
+ mutex_lock(&ctx->aio_mutex);
- /*
- * If at least one write was successfully sent, then discard any rc
- * value from the later writes. If the other write succeeds, then
- * we'll end up returning whatever was written. If it fails, then
- * we'll get a new rc value from that.
- */
- if (!list_empty(&wdata_list))
- rc = 0;
+ if (list_empty(&ctx->list)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+ rc = ctx->rc;
/*
* Wait for and collect replies for any successful sends in order of
- * increasing offset. Once an error is hit or we get a fatal signal
- * while waiting, then return without waiting for any more replies.
+ * increasing offset. Once an error is hit, then return without waiting
+ * for any more replies.
*/
restart_loop:
- list_for_each_entry_safe(wdata, tmp, &wdata_list, list) {
+ list_for_each_entry_safe(wdata, tmp, &ctx->list, list) {
if (!rc) {
- /* FIXME: freezable too? */
- rc = wait_for_completion_killable(&wdata->done);
- if (rc)
- rc = -EINTR;
- else if (wdata->result)
+ if (!try_wait_for_completion(&wdata->done)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+
+ if (wdata->result)
rc = wdata->result;
else
- total_written += wdata->bytes;
+ ctx->total_len += wdata->bytes;
/* resend call if it's a retryable error */
if (rc == -EAGAIN) {
struct list_head tmp_list;
- struct iov_iter tmp_from = saved_from;
+ struct iov_iter tmp_from = ctx->iter;
INIT_LIST_HEAD(&tmp_list);
list_del_init(&wdata->list);
iov_iter_advance(&tmp_from,
- wdata->offset - iocb->ki_pos);
+ wdata->offset - ctx->pos);
rc = cifs_write_from_iter(wdata->offset,
wdata->bytes, &tmp_from,
- open_file, cifs_sb, &tmp_list);
+ ctx->cfile, cifs_sb, &tmp_list,
+ ctx);
- list_splice(&tmp_list, &wdata_list);
+ list_splice(&tmp_list, &ctx->list);
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
@@ -2705,12 +2692,111 @@ restart_loop:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
+ for (i = 0; i < ctx->npages; i++)
+ put_page(ctx->bv[i].bv_page);
+
+ cifs_stats_bytes_written(tcon, ctx->total_len);
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags);
+
+ ctx->rc = (rc == 0) ? ctx->total_len : rc;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (ctx->iocb && ctx->iocb->ki_complete)
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ else
+ complete(&ctx->done);
+}
+
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t total_written = 0;
+ struct cifsFileInfo *cfile;
+ struct cifs_tcon *tcon;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_aio_ctx *ctx;
+ struct iov_iter saved_from = *from;
+ int rc;
+
+ /*
+ * BB - optimize the way when signing is disabled. We can drop this
+ * extra memory-to-memory copying and use iovec buffers for constructing
+ * write request.
+ */
+
+ rc = generic_write_checks(iocb, from);
+ if (rc <= 0)
+ return rc;
+
+ cifs_sb = CIFS_FILE_SB(file);
+ cfile = file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ if (!tcon->ses->server->ops->async_writev)
+ return -ENOSYS;
+
+ ctx = cifs_aio_ctx_alloc();
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->cfile = cifsFileInfo_get(cfile);
+
+ if (!is_sync_kiocb(iocb))
+ ctx->iocb = iocb;
+
+ ctx->pos = iocb->ki_pos;
+
+ rc = setup_aio_ctx_iter(ctx, from, WRITE);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ /* grab a lock here due to read response handlers can access ctx */
+ mutex_lock(&ctx->aio_mutex);
+
+ rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from,
+ cfile, cifs_sb, &ctx->list, ctx);
+
+ /*
+ * If at least one write was successfully sent, then discard any rc
+ * value from the later writes. If the other write succeeds, then
+ * we'll end up returning whatever was written. If it fails, then
+ * we'll get a new rc value from that.
+ */
+ if (!list_empty(&ctx->list))
+ rc = 0;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ if (!is_sync_kiocb(iocb)) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EIOCBQUEUED;
+ }
+
+ rc = wait_for_completion_killable(&ctx->done);
+ if (rc) {
+ mutex_lock(&ctx->aio_mutex);
+ ctx->rc = rc = -EINTR;
+ total_written = ctx->total_len;
+ mutex_unlock(&ctx->aio_mutex);
+ } else {
+ rc = ctx->rc;
+ total_written = ctx->total_len;
+ }
+
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+
if (unlikely(!total_written))
return rc;
iocb->ki_pos += total_written;
- set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags);
- cifs_stats_bytes_written(tcon, total_written);
return total_written;
}
@@ -2859,6 +2945,7 @@ cifs_uncached_readdata_release(struct kref *refcount)
struct cifs_readdata, refcount);
unsigned int i;
+ kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < rdata->nr_pages; i++) {
put_page(rdata->pages[i]);
rdata->pages[i] = NULL;
@@ -2900,6 +2987,8 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
return remaining ? -EFAULT : 0;
}
+static void collect_uncached_read_data(struct cifs_aio_ctx *ctx);
+
static void
cifs_uncached_readv_complete(struct work_struct *work)
{
@@ -2907,6 +2996,8 @@ cifs_uncached_readv_complete(struct work_struct *work)
struct cifs_readdata, work);
complete(&rdata->done);
+ collect_uncached_read_data(rdata->ctx);
+ /* the below call can possibly free the last ref to aio ctx */
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
@@ -2973,7 +3064,8 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
static int
cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
- struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
+ struct cifs_sb_info *cifs_sb, struct list_head *rdata_list,
+ struct cifs_aio_ctx *ctx)
{
struct cifs_readdata *rdata;
unsigned int npages, rsize, credits;
@@ -3020,9 +3112,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
rdata->read_into_pages = cifs_uncached_read_into_pages;
rdata->copy_into_pages = cifs_uncached_copy_into_pages;
rdata->credits = credits;
+ rdata->ctx = ctx;
+ kref_get(&ctx->refcount);
if (!rdata->cfile->invalidHandle ||
- !cifs_reopen_file(rdata->cfile, true))
+ !(rc = cifs_reopen_file(rdata->cfile, true)))
rc = server->ops->async_readv(rdata);
error:
if (rc) {
@@ -3042,50 +3136,37 @@ error:
return rc;
}
-ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+static void
+collect_uncached_read_data(struct cifs_aio_ctx *ctx)
{
- struct file *file = iocb->ki_filp;
- ssize_t rc;
- size_t len;
- ssize_t total_read = 0;
- loff_t offset = iocb->ki_pos;
+ struct cifs_readdata *rdata, *tmp;
+ struct iov_iter *to = &ctx->iter;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- struct cifsFileInfo *open_file;
- struct cifs_readdata *rdata, *tmp;
- struct list_head rdata_list;
-
- len = iov_iter_count(to);
- if (!len)
- return 0;
-
- INIT_LIST_HEAD(&rdata_list);
- cifs_sb = CIFS_FILE_SB(file);
- open_file = file->private_data;
- tcon = tlink_tcon(open_file->tlink);
-
- if (!tcon->ses->server->ops->async_readv)
- return -ENOSYS;
+ unsigned int i;
+ int rc;
- if ((file->f_flags & O_ACCMODE) == O_WRONLY)
- cifs_dbg(FYI, "attempting read on write only file instance\n");
+ tcon = tlink_tcon(ctx->cfile->tlink);
+ cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb);
- rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
+ mutex_lock(&ctx->aio_mutex);
- /* if at least one read request send succeeded, then reset rc */
- if (!list_empty(&rdata_list))
- rc = 0;
+ if (list_empty(&ctx->list)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
- len = iov_iter_count(to);
+ rc = ctx->rc;
/* the loop below should proceed in the order of increasing offsets */
again:
- list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+ list_for_each_entry_safe(rdata, tmp, &ctx->list, list) {
if (!rc) {
- /* FIXME: freezable sleep too? */
- rc = wait_for_completion_killable(&rdata->done);
- if (rc)
- rc = -EINTR;
- else if (rdata->result == -EAGAIN) {
+ if (!try_wait_for_completion(&rdata->done)) {
+ mutex_unlock(&ctx->aio_mutex);
+ return;
+ }
+
+ if (rdata->result == -EAGAIN) {
/* resend call if it's a retryable error */
struct list_head tmp_list;
unsigned int got_bytes = rdata->got_bytes;
@@ -3111,9 +3192,9 @@ again:
rdata->offset + got_bytes,
rdata->bytes - got_bytes,
rdata->cfile, cifs_sb,
- &tmp_list);
+ &tmp_list, ctx);
- list_splice(&tmp_list, &rdata_list);
+ list_splice(&tmp_list, &ctx->list);
kref_put(&rdata->refcount,
cifs_uncached_readdata_release);
@@ -3131,14 +3212,110 @@ again:
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
- total_read = len - iov_iter_count(to);
+ for (i = 0; i < ctx->npages; i++) {
+ if (ctx->should_dirty)
+ set_page_dirty(ctx->bv[i].bv_page);
+ put_page(ctx->bv[i].bv_page);
+ }
+
+ ctx->total_len = ctx->len - iov_iter_count(to);
- cifs_stats_bytes_read(tcon, total_read);
+ cifs_stats_bytes_read(tcon, ctx->total_len);
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
+ ctx->rc = (rc == 0) ? ctx->total_len : rc;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (ctx->iocb && ctx->iocb->ki_complete)
+ ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0);
+ else
+ complete(&ctx->done);
+}
+
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t rc;
+ size_t len;
+ ssize_t total_read = 0;
+ loff_t offset = iocb->ki_pos;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *cfile;
+ struct cifs_aio_ctx *ctx;
+
+ len = iov_iter_count(to);
+ if (!len)
+ return 0;
+
+ cifs_sb = CIFS_FILE_SB(file);
+ cfile = file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ if (!tcon->ses->server->ops->async_readv)
+ return -ENOSYS;
+
+ if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+ cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+ ctx = cifs_aio_ctx_alloc();
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->cfile = cifsFileInfo_get(cfile);
+
+ if (!is_sync_kiocb(iocb))
+ ctx->iocb = iocb;
+
+ if (to->type & ITER_IOVEC)
+ ctx->should_dirty = true;
+
+ rc = setup_aio_ctx_iter(ctx, to, READ);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ len = ctx->len;
+
+ /* grab a lock here due to read response handlers can access ctx */
+ mutex_lock(&ctx->aio_mutex);
+
+ rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx);
+
+ /* if at least one read request send succeeded, then reset rc */
+ if (!list_empty(&ctx->list))
+ rc = 0;
+
+ mutex_unlock(&ctx->aio_mutex);
+
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return rc;
+ }
+
+ if (!is_sync_kiocb(iocb)) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EIOCBQUEUED;
+ }
+
+ rc = wait_for_completion_killable(&ctx->done);
+ if (rc) {
+ mutex_lock(&ctx->aio_mutex);
+ ctx->rc = rc = -EINTR;
+ total_read = ctx->total_len;
+ mutex_unlock(&ctx->aio_mutex);
+ } else {
+ rc = ctx->rc;
+ total_read = ctx->total_len;
+ }
+
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+
if (total_read) {
iocb->ki_pos += total_read;
return total_read;
@@ -3617,7 +3794,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
}
if (!rdata->cfile->invalidHandle ||
- !cifs_reopen_file(rdata->cfile, true))
+ !(rc = cifs_reopen_file(rdata->cfile, true)))
rc = server->ops->async_readv(rdata);
if (rc) {
add_credits_and_wake_if(server, rdata->credits, 0);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 265c45fe4ea5..76fb0917dc8c 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -74,7 +74,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
src_inode->i_size, 0);
-
+ if (rc > 0)
+ rc = 0;
out_fput:
fdput(src_file);
out_drop_write:
@@ -208,10 +209,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = -EOPNOTSUPP;
break;
case CIFS_IOC_GET_MNT_INFO:
+ if (pSMBFile == NULL)
+ break;
tcon = tlink_tcon(pSMBFile->tlink);
rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
break;
case CIFS_ENUMERATE_SNAPSHOTS:
+ if (pSMBFile == NULL)
+ break;
if (arg == 0) {
rc = -EINVAL;
goto cifs_ioc_exit;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d3fb11529ed9..b08531977daa 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -22,6 +22,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/mempool.h>
+#include <linux/vmalloc.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
@@ -167,13 +168,11 @@ cifs_buf_get(void)
/* clear the first few header bytes */
/* for most paths, more is cleared in header_assemble */
- if (ret_buf) {
- memset(ret_buf, 0, buf_size + 3);
- atomic_inc(&bufAllocCount);
+ memset(ret_buf, 0, buf_size + 3);
+ atomic_inc(&bufAllocCount);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totBufAllocCount);
+ atomic_inc(&totBufAllocCount);
#endif /* CONFIG_CIFS_STATS2 */
- }
return ret_buf;
}
@@ -201,15 +200,13 @@ cifs_small_buf_get(void)
albeit slightly larger than necessary and maxbuffersize
defaults to this and can not be bigger */
ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
- if (ret_buf) {
/* No need to clear memory here, cleared in header assemble */
/* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
- atomic_inc(&smBufAllocCount);
+ atomic_inc(&smBufAllocCount);
#ifdef CONFIG_CIFS_STATS2
- atomic_inc(&totSmBufAllocCount);
+ atomic_inc(&totSmBufAllocCount);
#endif /* CONFIG_CIFS_STATS2 */
- }
return ret_buf;
}
@@ -492,7 +489,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&pCifsInode->flags);
- queue_work(cifsiod_wq,
+ queue_work(cifsoplockd_wq,
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
@@ -745,3 +742,122 @@ parse_DFS_referrals_exit:
}
return rc;
}
+
+struct cifs_aio_ctx *
+cifs_aio_ctx_alloc(void)
+{
+ struct cifs_aio_ctx *ctx;
+
+ ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ INIT_LIST_HEAD(&ctx->list);
+ mutex_init(&ctx->aio_mutex);
+ init_completion(&ctx->done);
+ kref_init(&ctx->refcount);
+ return ctx;
+}
+
+void
+cifs_aio_ctx_release(struct kref *refcount)
+{
+ struct cifs_aio_ctx *ctx = container_of(refcount,
+ struct cifs_aio_ctx, refcount);
+
+ cifsFileInfo_put(ctx->cfile);
+ kvfree(ctx->bv);
+ kfree(ctx);
+}
+
+#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024)
+
+int
+setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw)
+{
+ ssize_t rc;
+ unsigned int cur_npages;
+ unsigned int npages = 0;
+ unsigned int i;
+ size_t len;
+ size_t count = iov_iter_count(iter);
+ unsigned int saved_len;
+ size_t start;
+ unsigned int max_pages = iov_iter_npages(iter, INT_MAX);
+ struct page **pages = NULL;
+ struct bio_vec *bv = NULL;
+
+ if (iter->type & ITER_KVEC) {
+ memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
+ ctx->len = count;
+ iov_iter_advance(iter, count);
+ return 0;
+ }
+
+ if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT)
+ bv = kmalloc_array(max_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+
+ if (!bv) {
+ bv = vmalloc(max_pages * sizeof(struct bio_vec));
+ if (!bv)
+ return -ENOMEM;
+ }
+
+ if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT)
+ pages = kmalloc_array(max_pages, sizeof(struct page *),
+ GFP_KERNEL);
+
+ if (!pages) {
+ pages = vmalloc(max_pages * sizeof(struct page *));
+ if (!bv) {
+ kvfree(bv);
+ return -ENOMEM;
+ }
+ }
+
+ saved_len = count;
+
+ while (count && npages < max_pages) {
+ rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+ if (rc < 0) {
+ cifs_dbg(VFS, "couldn't get user pages (rc=%zd)\n", rc);
+ break;
+ }
+
+ if (rc > count) {
+ cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc,
+ count);
+ break;
+ }
+
+ iov_iter_advance(iter, rc);
+ count -= rc;
+ rc += start;
+ cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
+
+ if (npages + cur_npages > max_pages) {
+ cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n",
+ npages + cur_npages, max_pages);
+ break;
+ }
+
+ for (i = 0; i < cur_npages; i++) {
+ len = rc > PAGE_SIZE ? PAGE_SIZE : rc;
+ bv[npages + i].bv_page = pages[i];
+ bv[npages + i].bv_offset = start;
+ bv[npages + i].bv_len = len - start;
+ rc -= len;
+ start = 0;
+ }
+
+ npages += cur_npages;
+ }
+
+ kvfree(pages);
+ ctx->bv = bv;
+ ctx->len = saved_len - count;
+ ctx->npages = npages;
+ iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len);
+ return 0;
+}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index abae6dd2c6b9..cc88f4f0325e 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -980,10 +980,10 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
days = sd->Day;
month = sd->Month;
- if ((days > 31) || (month > 12)) {
+ if (days < 1 || days > 31 || month < 1 || month > 12) {
cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
- if (month > 12)
- month = 12;
+ days = clamp(days, 1, 31);
+ month = clamp(month, 1, 12);
}
month -= 1;
days += total_days_of_prev_months[month];
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index cc93ba4da9b5..27bc360c7ffd 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1015,6 +1015,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile)
return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
}
+static bool
+cifs_can_echo(struct TCP_Server_Info *server)
+{
+ if (server->tcpStatus == CifsGood)
+ return true;
+
+ return false;
+}
+
struct smb_version_operations smb1_operations = {
.send_cancel = send_nt_cancel,
.compare_fids = cifs_compare_fids,
@@ -1049,6 +1058,7 @@ struct smb_version_operations smb1_operations = {
.get_dfs_refer = CIFSGetDFSRefer,
.qfs_tcon = cifs_qfs_tcon,
.is_path_accessible = cifs_is_path_accessible,
+ .can_echo = cifs_can_echo,
.query_path_info = cifs_query_path_info,
.query_file_info = cifs_query_file_info,
.get_srv_inum = cifs_get_srv_inum,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1a04b3a5beb1..7b08a1446a7f 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -499,7 +499,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
else
cfile->oplock_break_cancelled = true;
- queue_work(cifsiod_wq, &cfile->oplock_break);
+ queue_work(cifsoplockd_wq, &cfile->oplock_break);
kfree(lw);
return true;
}
@@ -643,7 +643,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
&cinode->flags);
spin_unlock(&cfile->file_info_lock);
- queue_work(cifsiod_wq, &cfile->oplock_break);
+ queue_work(cifsoplockd_wq,
+ &cfile->oplock_break);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 7b12a727947e..c58691834eb2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -942,6 +942,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
}
if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) {
rc = -ERANGE;
+ kfree(retbuf);
return rc;
}
@@ -2195,7 +2196,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
if (rc)
goto free_pages;
- rc = cifs_discard_remaining_data(server, buf);
+ rc = cifs_discard_remaining_data(server);
if (rc)
goto free_pages;
@@ -2221,7 +2222,7 @@ free_pages:
kfree(pages);
return rc;
discard_data:
- cifs_discard_remaining_data(server, buf);
+ cifs_discard_remaining_data(server);
goto free_pages;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 66fa1b941cdf..48ff7703b919 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -33,6 +33,7 @@
#include <linux/vfs.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uaccess.h>
+#include <linux/uuid.h>
#include <linux/pagemap.h>
#include <linux/xattr.h>
#include "smb2pdu.h"
@@ -562,8 +563,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
* but for time being this is our only auth choice so doesn't matter.
* We just found a server which sets blob length to zero expecting raw.
*/
- if (blob_length == 0)
+ if (blob_length == 0) {
cifs_dbg(FYI, "missing security blob on negprot\n");
+ server->sec_ntlmssp = true;
+ }
rc = cifs_enable_signing(server, ses->sign);
if (rc)
@@ -630,8 +633,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
}
if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
- cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
- return -EIO;
+ cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n",
+ rsplen);
+
+ /* relax check since Mac returns max bufsize allowed on ioctl */
+ if (rsplen > CIFSMaxBufSize)
+ return -EIO;
}
/* check validate negotiate info response matches what we got earlier */
@@ -1171,9 +1178,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
else
return -EIO;
- if (tcon && tcon->bad_network_name)
- return -ENOENT;
-
unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
if (unc_path == NULL)
return -ENOMEM;
@@ -1277,8 +1281,6 @@ tcon_exit:
tcon_error_exit:
if (rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
- if (tcon)
- tcon->bad_network_name = true;
}
goto tcon_exit;
}
@@ -1856,8 +1858,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
* than one credit. Windows typically sets this smaller, but for some
* ioctls it may be useful to allow server to send more. No point
* limiting what the server can send as long as fits in one credit
+ * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE
+ * (by default, note that it can be overridden to make max larger)
+ * in responses (except for read responses which can be bigger.
+ * We may want to bump this limit up
*/
- req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+ req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize);
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
@@ -2181,6 +2187,9 @@ void smb2_reconnect_server(struct work_struct *work)
struct cifs_tcon *tcon, *tcon2;
struct list_head tmp_list;
int tcon_exist = false;
+ int rc;
+ int resched = false;
+
/* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
mutex_lock(&server->reconnect_mutex);
@@ -2208,13 +2217,18 @@ void smb2_reconnect_server(struct work_struct *work)
spin_unlock(&cifs_tcp_ses_lock);
list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
- if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon))
+ rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon);
+ if (!rc)
cifs_reopen_persistent_handles(tcon);
+ else
+ resched = true;
list_del_init(&tcon->rlist);
cifs_put_tcon(tcon);
}
cifs_dbg(FYI, "Reconnecting tcons finished\n");
+ if (resched)
+ queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ);
mutex_unlock(&server->reconnect_mutex);
/* now we can safely release srv struct */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 506b67fc93d9..c69ec96e92ac 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -538,23 +538,19 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
}
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
- if (temp == NULL)
- return temp;
- else {
- memset(temp, 0, sizeof(struct mid_q_entry));
- temp->mid = le64_to_cpu(shdr->MessageId);
- temp->pid = current->pid;
- temp->command = shdr->Command; /* Always LE */
- temp->when_alloc = jiffies;
- temp->server = server;
-
- /*
- * The default is for the mid to be synchronous, so the
- * default callback just wakes up the current task.
- */
- temp->callback = cifs_wake_up_task;
- temp->callback_data = current;
- }
+ memset(temp, 0, sizeof(struct mid_q_entry));
+ temp->mid = le64_to_cpu(shdr->MessageId);
+ temp->pid = current->pid;
+ temp->command = shdr->Command; /* Always LE */
+ temp->when_alloc = jiffies;
+ temp->server = server;
+
+ /*
+ * The default is for the mid to be synchronous, so the
+ * default callback just wakes up the current task.
+ */
+ temp->callback = cifs_wake_up_task;
+ temp->callback_data = current;
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index f6e13a977fc8..4d64b5b8fc9c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -55,26 +55,22 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
}
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
- if (temp == NULL)
- return temp;
- else {
- memset(temp, 0, sizeof(struct mid_q_entry));
- temp->mid = get_mid(smb_buffer);
- temp->pid = current->pid;
- temp->command = cpu_to_le16(smb_buffer->Command);
- cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
+ memset(temp, 0, sizeof(struct mid_q_entry));
+ temp->mid = get_mid(smb_buffer);
+ temp->pid = current->pid;
+ temp->command = cpu_to_le16(smb_buffer->Command);
+ cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
/* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
- /* when mid allocated can be before when sent */
- temp->when_alloc = jiffies;
- temp->server = server;
+ /* when mid allocated can be before when sent */
+ temp->when_alloc = jiffies;
+ temp->server = server;
- /*
- * The default is for the mid to be synchronous, so the
- * default callback just wakes up the current task.
- */
- temp->callback = cifs_wake_up_task;
- temp->callback_data = current;
- }
+ /*
+ * The default is for the mid to be synchronous, so the
+ * default callback just wakes up the current task.
+ */
+ temp->callback = cifs_wake_up_task;
+ temp->callback_data = current;
atomic_inc(&midCount);
temp->mid_state = MID_REQUEST_ALLOCATED;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2dea594da199..6058df380cc0 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
goto unlock_out;
}
- error = bdi_setup_and_register(&vc->bdi, "coda");
- if (error)
- goto unlock_out;
-
vc->vc_sb = sb;
mutex_unlock(&vc->vc_mutex);
@@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
sb->s_d_op = &coda_dentry_operations;
- sb->s_bdi = &vc->bdi;
+
+ error = super_setup_bdi(sb);
+ if (error)
+ goto error;
/* get root fid from Venus: this needs the root inode */
error = venus_rootfid(sb, &fid);
@@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
error:
mutex_lock(&vc->vc_mutex);
- bdi_destroy(&vc->bdi);
vc->vc_sb = NULL;
sb->s_fs_info = NULL;
unlock_out:
@@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb)
{
struct venus_comm *vcp = coda_vcp(sb);
mutex_lock(&vcp->vc_mutex);
- bdi_destroy(&vcp->bdi);
vcp->vc_sb = NULL;
sb->s_fs_info = NULL;
mutex_unlock(&vcp->vc_mutex);
diff --git a/fs/compat.c b/fs/compat.c
index c61b506f5bc9..190b38b39d9e 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -15,555 +15,14 @@
* published by the Free Software Foundation.
*/
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/linkage.h>
#include <linux/compat.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/fcntl.h>
-#include <linux/namei.h>
-#include <linux/file.h>
-#include <linux/fdtable.h>
-#include <linux/vfs.h>
-#include <linux/ioctl.h>
-#include <linux/init.h>
#include <linux/ncp_mount.h>
#include <linux/nfs4_mount.h>
#include <linux/syscalls.h>
-#include <linux/ctype.h>
-#include <linux/dirent.h>
-#include <linux/fsnotify.h>
-#include <linux/highuid.h>
-#include <linux/personality.h>
-#include <linux/rwsem.h>
-#include <linux/tsacct_kern.h>
-#include <linux/security.h>
-#include <linux/highmem.h>
-#include <linux/signal.h>
-#include <linux/poll.h>
-#include <linux/mm.h>
-#include <linux/fs_struct.h>
#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/aio.h>
-
#include <linux/uaccess.h>
-#include <asm/mmu_context.h>
-#include <asm/ioctls.h>
#include "internal.h"
-/*
- * Not all architectures have sys_utime, so implement this in terms
- * of sys_utimes.
- */
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
- struct compat_utimbuf __user *, t)
-{
- struct timespec tv[2];
-
- if (t) {
- if (get_user(tv[0].tv_sec, &t->actime) ||
- get_user(tv[1].tv_sec, &t->modtime))
- return -EFAULT;
- tv[0].tv_nsec = 0;
- tv[1].tv_nsec = 0;
- }
- return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
-}
-
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
-{
- struct timespec tv[2];
-
- if (t) {
- if (compat_get_timespec(&tv[0], &t[0]) ||
- compat_get_timespec(&tv[1], &t[1]))
- return -EFAULT;
-
- if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
- return 0;
- }
- return do_utimes(dfd, filename, t ? tv : NULL, flags);
-}
-
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
-{
- struct timespec tv[2];
-
- if (t) {
- if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
- get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
- get_user(tv[1].tv_sec, &t[1].tv_sec) ||
- get_user(tv[1].tv_nsec, &t[1].tv_usec))
- return -EFAULT;
- if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
- tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
- return -EINVAL;
- tv[0].tv_nsec *= 1000;
- tv[1].tv_nsec *= 1000;
- }
- return do_utimes(dfd, filename, t ? tv : NULL, 0);
-}
-
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
-{
- return compat_sys_futimesat(AT_FDCWD, filename, t);
-}
-
-static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
-{
- struct compat_stat tmp;
-
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
- return -EOVERFLOW;
-
- memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
- tmp.st_ino = stat->ino;
- if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
- return -EOVERFLOW;
- tmp.st_mode = stat->mode;
- tmp.st_nlink = stat->nlink;
- if (tmp.st_nlink != stat->nlink)
- return -EOVERFLOW;
- SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
- SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
- if ((u64) stat->size > MAX_NON_LFS)
- return -EOVERFLOW;
- tmp.st_size = stat->size;
- tmp.st_atime = stat->atime.tv_sec;
- tmp.st_atime_nsec = stat->atime.tv_nsec;
- tmp.st_mtime = stat->mtime.tv_sec;
- tmp.st_mtime_nsec = stat->mtime.tv_nsec;
- tmp.st_ctime = stat->ctime.tv_sec;
- tmp.st_ctime_nsec = stat->ctime.tv_nsec;
- tmp.st_blocks = stat->blocks;
- tmp.st_blksize = stat->blksize;
- return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
-}
-
-COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
- struct compat_stat __user *, statbuf)
-{
- struct kstat stat;
- int error;
-
- error = vfs_stat(filename, &stat);
- if (error)
- return error;
- return cp_compat_stat(&stat, statbuf);
-}
-
-COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
- struct compat_stat __user *, statbuf)
-{
- struct kstat stat;
- int error;
-
- error = vfs_lstat(filename, &stat);
- if (error)
- return error;
- return cp_compat_stat(&stat, statbuf);
-}
-
-#ifndef __ARCH_WANT_STAT64
-COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
- const char __user *, filename,
- struct compat_stat __user *, statbuf, int, flag)
-{
- struct kstat stat;
- int error;
-
- error = vfs_fstatat(dfd, filename, &stat, flag);
- if (error)
- return error;
- return cp_compat_stat(&stat, statbuf);
-}
-#endif
-
-COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
- struct compat_stat __user *, statbuf)
-{
- struct kstat stat;
- int error = vfs_fstat(fd, &stat);
-
- if (!error)
- error = cp_compat_stat(&stat, statbuf);
- return error;
-}
-
-static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
-{
-
- if (sizeof ubuf->f_blocks == 4) {
- if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
- kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /* f_files and f_ffree may be -1; it's okay
- * to stuff that into 32 bits */
- if (kbuf->f_files != 0xffffffffffffffffULL
- && (kbuf->f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (kbuf->f_ffree != 0xffffffffffffffffULL
- && (kbuf->f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
- __put_user(kbuf->f_type, &ubuf->f_type) ||
- __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
- __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
- __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
- __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
- __put_user(kbuf->f_files, &ubuf->f_files) ||
- __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
- __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
- __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
- __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
- __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(kbuf->f_flags, &ubuf->f_flags) ||
- __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
- return -EFAULT;
- return 0;
-}
-
-/*
- * The following statfs calls are copies of code from fs/statfs.c and
- * should be checked against those from time to time
- */
-COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
-{
- struct kstatfs tmp;
- int error = user_statfs(pathname, &tmp);
- if (!error)
- error = put_compat_statfs(buf, &tmp);
- return error;
-}
-
-COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
-{
- struct kstatfs tmp;
- int error = fd_statfs(fd, &tmp);
- if (!error)
- error = put_compat_statfs(buf, &tmp);
- return error;
-}
-
-static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
-{
- if (sizeof(ubuf->f_bsize) == 4) {
- if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
- kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /* f_files and f_ffree may be -1; it's okay
- * to stuff that into 32 bits */
- if (kbuf->f_files != 0xffffffffffffffffULL
- && (kbuf->f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (kbuf->f_ffree != 0xffffffffffffffffULL
- && (kbuf->f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
- if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
- __put_user(kbuf->f_type, &ubuf->f_type) ||
- __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
- __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
- __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
- __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
- __put_user(kbuf->f_files, &ubuf->f_files) ||
- __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
- __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
- __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
- __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
- __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
- __put_user(kbuf->f_flags, &ubuf->f_flags) ||
- __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
- return -EFAULT;
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
-{
- struct kstatfs tmp;
- int error;
-
- if (sz != sizeof(*buf))
- return -EINVAL;
-
- error = user_statfs(pathname, &tmp);
- if (!error)
- error = put_compat_statfs64(buf, &tmp);
- return error;
-}
-
-COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
-{
- struct kstatfs tmp;
- int error;
-
- if (sz != sizeof(*buf))
- return -EINVAL;
-
- error = fd_statfs(fd, &tmp);
- if (!error)
- error = put_compat_statfs64(buf, &tmp);
- return error;
-}
-
-/*
- * This is a copy of sys_ustat, just dealing with a structure layout.
- * Given how simple this syscall is that apporach is more maintainable
- * than the various conversion hacks.
- */
-COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
-{
- struct compat_ustat tmp;
- struct kstatfs sbuf;
- int err = vfs_ustat(new_decode_dev(dev), &sbuf);
- if (err)
- return err;
-
- memset(&tmp, 0, sizeof(struct compat_ustat));
- tmp.f_tfree = sbuf.f_bfree;
- tmp.f_tinode = sbuf.f_ffree;
- if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
- return -EFAULT;
- return 0;
-}
-
-static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
-{
- if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
- __get_user(kfl->l_type, &ufl->l_type) ||
- __get_user(kfl->l_whence, &ufl->l_whence) ||
- __get_user(kfl->l_start, &ufl->l_start) ||
- __get_user(kfl->l_len, &ufl->l_len) ||
- __get_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-
-static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
-{
- if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
- __put_user(kfl->l_type, &ufl->l_type) ||
- __put_user(kfl->l_whence, &ufl->l_whence) ||
- __put_user(kfl->l_start, &ufl->l_start) ||
- __put_user(kfl->l_len, &ufl->l_len) ||
- __put_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-
-#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
-static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
-{
- if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
- __get_user(kfl->l_type, &ufl->l_type) ||
- __get_user(kfl->l_whence, &ufl->l_whence) ||
- __get_user(kfl->l_start, &ufl->l_start) ||
- __get_user(kfl->l_len, &ufl->l_len) ||
- __get_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-#endif
-
-#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
-static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
-{
- if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
- __put_user(kfl->l_type, &ufl->l_type) ||
- __put_user(kfl->l_whence, &ufl->l_whence) ||
- __put_user(kfl->l_start, &ufl->l_start) ||
- __put_user(kfl->l_len, &ufl->l_len) ||
- __put_user(kfl->l_pid, &ufl->l_pid))
- return -EFAULT;
- return 0;
-}
-#endif
-
-static unsigned int
-convert_fcntl_cmd(unsigned int cmd)
-{
- switch (cmd) {
- case F_GETLK64:
- return F_GETLK;
- case F_SETLK64:
- return F_SETLK;
- case F_SETLKW64:
- return F_SETLKW;
- }
-
- return cmd;
-}
-
-COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
- compat_ulong_t, arg)
-{
- mm_segment_t old_fs;
- struct flock f;
- long ret;
- unsigned int conv_cmd;
-
- switch (cmd) {
- case F_GETLK:
- case F_SETLK:
- case F_SETLKW:
- ret = get_compat_flock(&f, compat_ptr(arg));
- if (ret != 0)
- break;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_fcntl(fd, cmd, (unsigned long)&f);
- set_fs(old_fs);
- if (cmd == F_GETLK && ret == 0) {
- /* GETLK was successful and we need to return the data...
- * but it needs to fit in the compat structure.
- * l_start shouldn't be too big, unless the original
- * start + end is greater than COMPAT_OFF_T_MAX, in which
- * case the app was asking for trouble, so we return
- * -EOVERFLOW in that case.
- * l_len could be too big, in which case we just truncate it,
- * and only allow the app to see that part of the conflicting
- * lock that might make sense to it anyway
- */
-
- if (f.l_start > COMPAT_OFF_T_MAX)
- ret = -EOVERFLOW;
- if (f.l_len > COMPAT_OFF_T_MAX)
- f.l_len = COMPAT_OFF_T_MAX;
- if (ret == 0)
- ret = put_compat_flock(&f, compat_ptr(arg));
- }
- break;
-
- case F_GETLK64:
- case F_SETLK64:
- case F_SETLKW64:
- case F_OFD_GETLK:
- case F_OFD_SETLK:
- case F_OFD_SETLKW:
- ret = get_compat_flock64(&f, compat_ptr(arg));
- if (ret != 0)
- break;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- conv_cmd = convert_fcntl_cmd(cmd);
- ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
- set_fs(old_fs);
- if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
- /* need to return lock information - see above for commentary */
- if (f.l_start > COMPAT_LOFF_T_MAX)
- ret = -EOVERFLOW;
- if (f.l_len > COMPAT_LOFF_T_MAX)
- f.l_len = COMPAT_LOFF_T_MAX;
- if (ret == 0)
- ret = put_compat_flock64(&f, compat_ptr(arg));
- }
- break;
-
- default:
- ret = sys_fcntl(fd, cmd, arg);
- break;
- }
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
- compat_ulong_t, arg)
-{
- switch (cmd) {
- case F_GETLK64:
- case F_SETLK64:
- case F_SETLKW64:
- case F_OFD_GETLK:
- case F_OFD_SETLK:
- case F_OFD_SETLKW:
- return -EINVAL;
- }
- return compat_sys_fcntl64(fd, cmd, arg);
-}
-
-/* A write operation does a read from user space and vice versa */
-#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
-
-ssize_t compat_rw_copy_check_uvector(int type,
- const struct compat_iovec __user *uvector, unsigned long nr_segs,
- unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- compat_ssize_t tot_len;
- struct iovec *iov = *ret_pointer = fast_pointer;
- ssize_t ret = 0;
- int seg;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0)
- goto out;
-
- ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV)
- goto out;
- if (nr_segs > fast_segs) {
- ret = -ENOMEM;
- iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL)
- goto out;
- }
- *ret_pointer = iov;
-
- ret = -EFAULT;
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t.
- *
- * In Linux, the total length is limited to MAX_RW_COUNT, there is
- * no overflow possibility.
- */
- tot_len = 0;
- ret = -EINVAL;
- for (seg = 0; seg < nr_segs; seg++) {
- compat_uptr_t buf;
- compat_ssize_t len;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting in compat_ssize_t .. */
- goto out;
- if (type >= 0 &&
- !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - tot_len)
- len = MAX_RW_COUNT - tot_len;
- tot_len += len;
- iov->iov_base = compat_ptr(buf);
- iov->iov_len = (compat_size_t) len;
- uvector++;
- iov++;
- }
- ret = tot_len;
-
-out:
- return ret;
-}
-
struct compat_ncp_mount_data {
compat_int_t version;
compat_uint_t ncp_fd;
@@ -744,653 +203,3 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
out:
return retval;
}
-
-struct compat_old_linux_dirent {
- compat_ulong_t d_ino;
- compat_ulong_t d_offset;
- unsigned short d_namlen;
- char d_name[1];
-};
-
-struct compat_readdir_callback {
- struct dir_context ctx;
- struct compat_old_linux_dirent __user *dirent;
- int result;
-};
-
-static int compat_fillonedir(struct dir_context *ctx, const char *name,
- int namlen, loff_t offset, u64 ino,
- unsigned int d_type)
-{
- struct compat_readdir_callback *buf =
- container_of(ctx, struct compat_readdir_callback, ctx);
- struct compat_old_linux_dirent __user *dirent;
- compat_ulong_t d_ino;
-
- if (buf->result)
- return -EINVAL;
- d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
- buf->result = -EOVERFLOW;
- return -EOVERFLOW;
- }
- buf->result++;
- dirent = buf->dirent;
- if (!access_ok(VERIFY_WRITE, dirent,
- (unsigned long)(dirent->d_name + namlen + 1) -
- (unsigned long)dirent))
- goto efault;
- if ( __put_user(d_ino, &dirent->d_ino) ||
- __put_user(offset, &dirent->d_offset) ||
- __put_user(namlen, &dirent->d_namlen) ||
- __copy_to_user(dirent->d_name, name, namlen) ||
- __put_user(0, dirent->d_name + namlen))
- goto efault;
- return 0;
-efault:
- buf->result = -EFAULT;
- return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
- struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
-{
- int error;
- struct fd f = fdget_pos(fd);
- struct compat_readdir_callback buf = {
- .ctx.actor = compat_fillonedir,
- .dirent = dirent
- };
-
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx);
- if (buf.result)
- error = buf.result;
-
- fdput_pos(f);
- return error;
-}
-
-struct compat_linux_dirent {
- compat_ulong_t d_ino;
- compat_ulong_t d_off;
- unsigned short d_reclen;
- char d_name[1];
-};
-
-struct compat_getdents_callback {
- struct dir_context ctx;
- struct compat_linux_dirent __user *current_dir;
- struct compat_linux_dirent __user *previous;
- int count;
- int error;
-};
-
-static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
- loff_t offset, u64 ino, unsigned int d_type)
-{
- struct compat_linux_dirent __user * dirent;
- struct compat_getdents_callback *buf =
- container_of(ctx, struct compat_getdents_callback, ctx);
- compat_ulong_t d_ino;
- int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
- namlen + 2, sizeof(compat_long_t));
-
- buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
- return -EINVAL;
- d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
- buf->error = -EOVERFLOW;
- return -EOVERFLOW;
- }
- dirent = buf->previous;
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user(d_ino, &dirent->d_ino))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
- goto efault;
- if (__put_user(d_type, (char __user *) dirent + reclen - 1))
- goto efault;
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
- buf->count -= reclen;
- return 0;
-efault:
- buf->error = -EFAULT;
- return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
- struct compat_linux_dirent __user *, dirent, unsigned int, count)
-{
- struct fd f;
- struct compat_linux_dirent __user * lastdirent;
- struct compat_getdents_callback buf = {
- .ctx.actor = compat_filldir,
- .current_dir = dirent,
- .count = count
- };
- int error;
-
- if (!access_ok(VERIFY_WRITE, dirent, count))
- return -EFAULT;
-
- f = fdget_pos(fd);
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx);
- if (error >= 0)
- error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
- if (put_user(buf.ctx.pos, &lastdirent->d_off))
- error = -EFAULT;
- else
- error = count - buf.count;
- }
- fdput_pos(f);
- return error;
-}
-
-#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
-
-struct compat_getdents_callback64 {
- struct dir_context ctx;
- struct linux_dirent64 __user *current_dir;
- struct linux_dirent64 __user *previous;
- int count;
- int error;
-};
-
-static int compat_filldir64(struct dir_context *ctx, const char *name,
- int namlen, loff_t offset, u64 ino,
- unsigned int d_type)
-{
- struct linux_dirent64 __user *dirent;
- struct compat_getdents_callback64 *buf =
- container_of(ctx, struct compat_getdents_callback64, ctx);
- int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
- sizeof(u64));
- u64 off;
-
- buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
- return -EINVAL;
- dirent = buf->previous;
-
- if (dirent) {
- if (signal_pending(current))
- return -EINTR;
- if (__put_user_unaligned(offset, &dirent->d_off))
- goto efault;
- }
- dirent = buf->current_dir;
- if (__put_user_unaligned(ino, &dirent->d_ino))
- goto efault;
- off = 0;
- if (__put_user_unaligned(off, &dirent->d_off))
- goto efault;
- if (__put_user(reclen, &dirent->d_reclen))
- goto efault;
- if (__put_user(d_type, &dirent->d_type))
- goto efault;
- if (copy_to_user(dirent->d_name, name, namlen))
- goto efault;
- if (__put_user(0, dirent->d_name + namlen))
- goto efault;
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
- buf->count -= reclen;
- return 0;
-efault:
- buf->error = -EFAULT;
- return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
- struct linux_dirent64 __user *, dirent, unsigned int, count)
-{
- struct fd f;
- struct linux_dirent64 __user * lastdirent;
- struct compat_getdents_callback64 buf = {
- .ctx.actor = compat_filldir64,
- .current_dir = dirent,
- .count = count
- };
- int error;
-
- if (!access_ok(VERIFY_WRITE, dirent, count))
- return -EFAULT;
-
- f = fdget_pos(fd);
- if (!f.file)
- return -EBADF;
-
- error = iterate_dir(f.file, &buf.ctx);
- if (error >= 0)
- error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
- typeof(lastdirent->d_off) d_off = buf.ctx.pos;
- if (__put_user_unaligned(d_off, &lastdirent->d_off))
- error = -EFAULT;
- else
- error = count - buf.count;
- }
- fdput_pos(f);
- return error;
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
-
-/*
- * Exactly like fs/open.c:sys_open(), except that it doesn't set the
- * O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
-{
- return do_sys_open(AT_FDCWD, filename, flags, mode);
-}
-
-/*
- * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
- * O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
-{
- return do_sys_open(dfd, filename, flags, mode);
-}
-
-#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
-
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
- int timeval, int ret)
-{
- struct timespec ts;
-
- if (!p)
- return ret;
-
- if (current->personality & STICKY_TIMEOUTS)
- goto sticky;
-
- /* No update for zero timeout */
- if (!end_time->tv_sec && !end_time->tv_nsec)
- return ret;
-
- ktime_get_ts(&ts);
- ts = timespec_sub(*end_time, ts);
- if (ts.tv_sec < 0)
- ts.tv_sec = ts.tv_nsec = 0;
-
- if (timeval) {
- struct compat_timeval rtv;
-
- rtv.tv_sec = ts.tv_sec;
- rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
-
- if (!copy_to_user(p, &rtv, sizeof(rtv)))
- return ret;
- } else {
- struct compat_timespec rts;
-
- rts.tv_sec = ts.tv_sec;
- rts.tv_nsec = ts.tv_nsec;
-
- if (!copy_to_user(p, &rts, sizeof(rts)))
- return ret;
- }
- /*
- * If an application puts its timeval in read-only memory, we
- * don't want the Linux-specific update to the timeval to
- * cause a fault after the select has completed
- * successfully. However, because we're not updating the
- * timeval, we can't restart the system call.
- */
-
-sticky:
- if (ret == -ERESTARTNOHAND)
- ret = -EINTR;
- return ret;
-}
-
-/*
- * Ooo, nasty. We need here to frob 32-bit unsigned longs to
- * 64-bit unsigned longs.
- */
-static
-int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
- unsigned long *fdset)
-{
- nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
- if (ufdset) {
- unsigned long odd;
-
- if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
- return -EFAULT;
-
- odd = nr & 1UL;
- nr &= ~1UL;
- while (nr) {
- unsigned long h, l;
- if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
- return -EFAULT;
- ufdset += 2;
- *fdset++ = h << 32 | l;
- nr -= 2;
- }
- if (odd && __get_user(*fdset, ufdset))
- return -EFAULT;
- } else {
- /* Tricky, must clear full unsigned long in the
- * kernel fdset at the end, this makes sure that
- * actually happens.
- */
- memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
- }
- return 0;
-}
-
-static
-int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
- unsigned long *fdset)
-{
- unsigned long odd;
- nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
-
- if (!ufdset)
- return 0;
-
- odd = nr & 1UL;
- nr &= ~1UL;
- while (nr) {
- unsigned long h, l;
- l = *fdset++;
- h = l >> 32;
- if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
- return -EFAULT;
- ufdset += 2;
- nr -= 2;
- }
- if (odd && __put_user(*fdset, ufdset))
- return -EFAULT;
- return 0;
-}
-
-
-/*
- * This is a virtual copy of sys_select from fs/select.c and probably
- * should be compared to it from time to time
- */
-
-/*
- * We can actually return ERESTARTSYS instead of EINTR, but I'd
- * like to be certain this leads to no problems. So I return
- * EINTR just for safety.
- *
- * Update: ERESTARTSYS breaks at least the xview clock binary, so
- * I'm trying ERESTARTNOHAND which restart only when you want to.
- */
-int compat_core_sys_select(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct timespec *end_time)
-{
- fd_set_bits fds;
- void *bits;
- int size, max_fds, ret = -EINVAL;
- struct fdtable *fdt;
- long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
-
- if (n < 0)
- goto out_nofds;
-
- /* max_fds can increase, so grab it once to avoid race */
- rcu_read_lock();
- fdt = files_fdtable(current->files);
- max_fds = fdt->max_fds;
- rcu_read_unlock();
- if (n > max_fds)
- n = max_fds;
-
- /*
- * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
- * since we used fdset we need to allocate memory in units of
- * long-words.
- */
- size = FDS_BYTES(n);
- bits = stack_fds;
- if (size > sizeof(stack_fds) / 6) {
- bits = kmalloc(6 * size, GFP_KERNEL);
- ret = -ENOMEM;
- if (!bits)
- goto out_nofds;
- }
- fds.in = (unsigned long *) bits;
- fds.out = (unsigned long *) (bits + size);
- fds.ex = (unsigned long *) (bits + 2*size);
- fds.res_in = (unsigned long *) (bits + 3*size);
- fds.res_out = (unsigned long *) (bits + 4*size);
- fds.res_ex = (unsigned long *) (bits + 5*size);
-
- if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
- (ret = compat_get_fd_set(n, outp, fds.out)) ||
- (ret = compat_get_fd_set(n, exp, fds.ex)))
- goto out;
- zero_fd_set(n, fds.res_in);
- zero_fd_set(n, fds.res_out);
- zero_fd_set(n, fds.res_ex);
-
- ret = do_select(n, &fds, end_time);
-
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = -ERESTARTNOHAND;
- if (signal_pending(current))
- goto out;
- ret = 0;
- }
-
- if (compat_set_fd_set(n, inp, fds.res_in) ||
- compat_set_fd_set(n, outp, fds.res_out) ||
- compat_set_fd_set(n, exp, fds.res_ex))
- ret = -EFAULT;
-out:
- if (bits != stack_fds)
- kfree(bits);
-out_nofds:
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
- compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
- struct compat_timeval __user *, tvp)
-{
- struct timespec end_time, *to = NULL;
- struct compat_timeval tv;
- int ret;
-
- if (tvp) {
- if (copy_from_user(&tv, tvp, sizeof(tv)))
- return -EFAULT;
-
- to = &end_time;
- if (poll_select_set_timeout(to,
- tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
- (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
- return -EINVAL;
- }
-
- ret = compat_core_sys_select(n, inp, outp, exp, to);
- ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
-
- return ret;
-}
-
-struct compat_sel_arg_struct {
- compat_ulong_t n;
- compat_uptr_t inp;
- compat_uptr_t outp;
- compat_uptr_t exp;
- compat_uptr_t tvp;
-};
-
-COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
-{
- struct compat_sel_arg_struct a;
-
- if (copy_from_user(&a, arg, sizeof(a)))
- return -EFAULT;
- return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
- compat_ptr(a.exp), compat_ptr(a.tvp));
-}
-
-static long do_compat_pselect(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
-{
- compat_sigset_t ss32;
- sigset_t ksigmask, sigsaved;
- struct compat_timespec ts;
- struct timespec end_time, *to = NULL;
- int ret;
-
- if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
- return -EFAULT;
-
- to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
- return -EINVAL;
- }
-
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
- return -EFAULT;
- sigset_from_compat(&ksigmask, &ss32);
-
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
-
- ret = compat_core_sys_select(n, inp, outp, exp, to);
- ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
- if (ret == -ERESTARTNOHAND) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
- compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
- struct compat_timespec __user *, tsp, void __user *, sig)
-{
- compat_size_t sigsetsize = 0;
- compat_uptr_t up = 0;
-
- if (sig) {
- if (!access_ok(VERIFY_READ, sig,
- sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
- __get_user(up, (compat_uptr_t __user *)sig) ||
- __get_user(sigsetsize,
- (compat_size_t __user *)(sig+sizeof(up))))
- return -EFAULT;
- }
- return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
- sigsetsize);
-}
-
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
- unsigned int, nfds, struct compat_timespec __user *, tsp,
- const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
-{
- compat_sigset_t ss32;
- sigset_t ksigmask, sigsaved;
- struct compat_timespec ts;
- struct timespec end_time, *to = NULL;
- int ret;
-
- if (tsp) {
- if (copy_from_user(&ts, tsp, sizeof(ts)))
- return -EFAULT;
-
- to = &end_time;
- if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
- return -EINVAL;
- }
-
- if (sigmask) {
- if (sigsetsize != sizeof(compat_sigset_t))
- return -EINVAL;
- if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
- return -EFAULT;
- sigset_from_compat(&ksigmask, &ss32);
-
- sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
- sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
- }
-
- ret = do_sys_poll(ufds, nfds, to);
-
- /* We can restart this syscall, usually */
- if (ret == -EINTR) {
- /*
- * Don't restore the signal mask yet. Let do_signal() deliver
- * the signal on the way back to userspace, before the signal
- * mask is restored.
- */
- if (sigmask) {
- memcpy(&current->saved_sigmask, &sigsaved,
- sizeof(sigsaved));
- set_restore_sigmask();
- }
- ret = -ERESTARTNOHAND;
- } else if (sigmask)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
- ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
- return ret;
-}
-
-#ifdef CONFIG_FHANDLE
-/*
- * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
- * doesn't set the O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
- struct file_handle __user *, handle, int, flags)
-{
- return do_handle_open(mountdirfd, handle, flags);
-}
-#endif
diff --git a/fs/dax.c b/fs/dax.c
index 85abd741253d..43bbd6d1037d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
- struct request_queue *q = bdev->bd_queue;
- long rc = -EIO;
-
- dax->addr = ERR_PTR(-EIO);
- if (blk_queue_enter(q, true) != 0)
- return rc;
-
- rc = bdev_direct_access(bdev, dax);
- if (rc < 0) {
- dax->addr = ERR_PTR(rc);
- blk_queue_exit(q);
- return rc;
- }
- return rc;
-}
-
-static void dax_unmap_atomic(struct block_device *bdev,
- const struct blk_dax_ctl *dax)
-{
- if (IS_ERR(dax->addr))
- return;
- blk_queue_exit(bdev->bd_queue);
-}
-
static int dax_is_pmd_entry(void *entry)
{
return (unsigned long)entry & RADIX_DAX_PMD;
@@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry)
return (unsigned long)entry & RADIX_DAX_EMPTY;
}
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-{
- struct page *page = alloc_pages(GFP_KERNEL, 0);
- struct blk_dax_ctl dax = {
- .size = PAGE_SIZE,
- .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
- };
- long rc;
-
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- rc = dax_map_atomic(bdev, &dax);
- if (rc < 0)
- return ERR_PTR(rc);
- memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
- dax_unmap_atomic(bdev, &dax);
- return page;
-}
-
/*
* DAX radix tree locking
*/
@@ -582,21 +536,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
return ret;
}
-static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
- struct page *to, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
+ sector_t sector, size_t size, struct page *to,
+ unsigned long vaddr)
{
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = size,
- };
- void *vto;
-
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
+ void *vto, *kaddr;
+ pgoff_t pgoff;
+ pfn_t pfn;
+ long rc;
+ int id;
+
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+ copy_user_page(vto, (void __force *)kaddr, vaddr, to);
kunmap_atomic(vto);
- dax_unmap_atomic(bdev, &dax);
+ dax_read_unlock(id);
return 0;
}
@@ -764,12 +727,16 @@ unlock_pte:
}
static int dax_writeback_one(struct block_device *bdev,
- struct address_space *mapping, pgoff_t index, void *entry)
+ struct dax_device *dax_dev, struct address_space *mapping,
+ pgoff_t index, void *entry)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- struct blk_dax_ctl dax;
- void *entry2, **slot;
- int ret = 0;
+ void *entry2, **slot, *kaddr;
+ long ret = 0, id;
+ sector_t sector;
+ pgoff_t pgoff;
+ size_t size;
+ pfn_t pfn;
/*
* A page got tagged dirty in DAX mapping? Something is seriously
@@ -818,26 +785,29 @@ static int dax_writeback_one(struct block_device *bdev,
* 'entry'. This allows us to flush for PMD_SIZE and not have to
* worry about partial PMD writebacks.
*/
- dax.sector = dax_radix_sector(entry);
- dax.size = PAGE_SIZE << dax_radix_order(entry);
+ sector = dax_radix_sector(entry);
+ size = PAGE_SIZE << dax_radix_order(entry);
+
+ id = dax_read_lock();
+ ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (ret)
+ goto dax_unlock;
/*
- * We cannot hold tree_lock while calling dax_map_atomic() because it
- * eventually calls cond_resched().
+ * dax_direct_access() may sleep, so cannot hold tree_lock over
+ * its invocation.
*/
- ret = dax_map_atomic(bdev, &dax);
- if (ret < 0) {
- put_locked_mapping_entry(mapping, index, entry);
- return ret;
- }
+ ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
+ if (ret < 0)
+ goto dax_unlock;
- if (WARN_ON_ONCE(ret < dax.size)) {
+ if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
ret = -EIO;
- goto unmap;
+ goto dax_unlock;
}
- dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
- wb_cache_pmem(dax.addr, dax.size);
+ dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
+ wb_cache_pmem(kaddr, size);
/*
* After we have flushed the cache, we can clear the dirty tag. There
* cannot be new dirty data in the pfn after the flush has completed as
@@ -847,8 +817,8 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock);
- unmap:
- dax_unmap_atomic(bdev, &dax);
+ dax_unlock:
+ dax_read_unlock(id);
put_locked_mapping_entry(mapping, index, entry);
return ret;
@@ -869,6 +839,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct inode *inode = mapping->host;
pgoff_t start_index, end_index;
pgoff_t indices[PAGEVEC_SIZE];
+ struct dax_device *dax_dev;
struct pagevec pvec;
bool done = false;
int i, ret = 0;
@@ -879,6 +850,10 @@ int dax_writeback_mapping_range(struct address_space *mapping,
if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
return 0;
+ dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ if (!dax_dev)
+ return -EIO;
+
start_index = wbc->range_start >> PAGE_SHIFT;
end_index = wbc->range_end >> PAGE_SHIFT;
@@ -899,38 +874,49 @@ int dax_writeback_mapping_range(struct address_space *mapping,
break;
}
- ret = dax_writeback_one(bdev, mapping, indices[i],
- pvec.pages[i]);
- if (ret < 0)
+ ret = dax_writeback_one(bdev, dax_dev, mapping,
+ indices[i], pvec.pages[i]);
+ if (ret < 0) {
+ put_dax(dax_dev);
return ret;
+ }
}
}
+ put_dax(dax_dev);
return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
- struct block_device *bdev, sector_t sector, size_t size,
- void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
+ struct block_device *bdev, struct dax_device *dax_dev,
+ sector_t sector, size_t size, void **entryp,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = vmf->address;
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = size,
- };
- void *ret;
void *entry = *entryp;
+ void *ret, *kaddr;
+ pgoff_t pgoff;
+ int id, rc;
+ pfn_t pfn;
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- dax_unmap_atomic(bdev, &dax);
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
- ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
+ dax_read_unlock(id);
+
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
*entryp = ret;
- return vm_insert_mixed(vma, vaddr, dax.pfn);
+ return vm_insert_mixed(vma, vaddr, pfn);
}
/**
@@ -979,24 +965,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
return true;
}
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
- unsigned int offset, unsigned int length)
+int __dax_zero_page_range(struct block_device *bdev,
+ struct dax_device *dax_dev, sector_t sector,
+ unsigned int offset, unsigned int size)
{
- struct blk_dax_ctl dax = {
- .sector = sector,
- .size = PAGE_SIZE,
- };
-
- if (dax_range_is_aligned(bdev, offset, length)) {
- sector_t start_sector = dax.sector + (offset >> 9);
+ if (dax_range_is_aligned(bdev, offset, size)) {
+ sector_t start_sector = sector + (offset >> 9);
return blkdev_issue_zeroout(bdev, start_sector,
- length >> 9, GFP_NOFS, true);
+ size >> 9, GFP_NOFS, 0);
} else {
- if (dax_map_atomic(bdev, &dax) < 0)
- return PTR_ERR(dax.addr);
- clear_pmem(dax.addr + offset, length);
- dax_unmap_atomic(bdev, &dax);
+ pgoff_t pgoff;
+ long rc, id;
+ void *kaddr;
+ pfn_t pfn;
+
+ rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (rc)
+ return rc;
+
+ id = dax_read_lock();
+ rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
+ &pfn);
+ if (rc < 0) {
+ dax_read_unlock(id);
+ return rc;
+ }
+ clear_pmem(kaddr + offset, size);
+ dax_read_unlock(id);
}
return 0;
}
@@ -1011,9 +1007,12 @@ static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
{
+ struct block_device *bdev = iomap->bdev;
+ struct dax_device *dax_dev = iomap->dax_dev;
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
+ int id;
if (iov_iter_rw(iter) == READ) {
end = min(end, i_size_read(inode));
@@ -1038,34 +1037,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
(end - 1) >> PAGE_SHIFT);
}
+ id = dax_read_lock();
while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
- struct blk_dax_ctl dax = { 0 };
+ const size_t size = ALIGN(length + offset, PAGE_SIZE);
+ const sector_t sector = dax_iomap_sector(iomap, pos);
ssize_t map_len;
+ pgoff_t pgoff;
+ void *kaddr;
+ pfn_t pfn;
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
- dax.sector = dax_iomap_sector(iomap, pos);
- dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
- map_len = dax_map_atomic(iomap->bdev, &dax);
+ ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+ if (ret)
+ break;
+
+ map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
+ &kaddr, &pfn);
if (map_len < 0) {
ret = map_len;
break;
}
- dax.addr += offset;
+ map_len = PFN_PHYS(map_len);
+ kaddr += offset;
map_len -= offset;
if (map_len > end - pos)
map_len = end - pos;
if (iov_iter_rw(iter) == WRITE)
- map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+ map_len = copy_from_iter_pmem(kaddr, map_len, iter);
else
- map_len = copy_to_iter(dax.addr, map_len, iter);
- dax_unmap_atomic(iomap->bdev, &dax);
+ map_len = copy_to_iter(kaddr, map_len, iter);
if (map_len <= 0) {
ret = map_len ? map_len : -EFAULT;
break;
@@ -1075,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
length -= map_len;
done += map_len;
}
+ dax_read_unlock(id);
return done ? done : ret;
}
@@ -1181,8 +1189,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
clear_user_highpage(vmf->cow_page, vaddr);
break;
case IOMAP_MAPPED:
- error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
- vmf->cow_page, vaddr);
+ error = copy_user_dax(iomap.bdev, iomap.dax_dev,
+ sector, PAGE_SIZE, vmf->cow_page, vaddr);
break;
default:
WARN_ON_ONCE(1);
@@ -1207,8 +1215,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR;
}
- error = dax_insert_mapping(mapping, iomap.bdev, sector,
- PAGE_SIZE, &entry, vmf->vma, vmf);
+ error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
+ sector, PAGE_SIZE, &entry, vmf->vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
@@ -1258,41 +1266,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
loff_t pos, void **entryp)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ const sector_t sector = dax_iomap_sector(iomap, pos);
+ struct dax_device *dax_dev = iomap->dax_dev;
struct block_device *bdev = iomap->bdev;
struct inode *inode = mapping->host;
- struct blk_dax_ctl dax = {
- .sector = dax_iomap_sector(iomap, pos),
- .size = PMD_SIZE,
- };
- long length = dax_map_atomic(bdev, &dax);
- void *ret = NULL;
-
- if (length < 0) /* dax_map_atomic() failed */
+ const size_t size = PMD_SIZE;
+ void *ret = NULL, *kaddr;
+ long length = 0;
+ pgoff_t pgoff;
+ pfn_t pfn;
+ int id;
+
+ if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
goto fallback;
- if (length < PMD_SIZE)
- goto unmap_fallback;
- if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
- goto unmap_fallback;
- if (!pfn_t_devmap(dax.pfn))
- goto unmap_fallback;
-
- dax_unmap_atomic(bdev, &dax);
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+ id = dax_read_lock();
+ length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+ if (length < 0)
+ goto unlock_fallback;
+ length = PFN_PHYS(length);
+
+ if (length < size)
+ goto unlock_fallback;
+ if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
+ goto unlock_fallback;
+ if (!pfn_t_devmap(pfn))
+ goto unlock_fallback;
+ dax_read_unlock(id);
+
+ ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
*entryp = ret;
- trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+ trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
- dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
+ pfn, vmf->flags & FAULT_FLAG_WRITE);
- unmap_fallback:
- dax_unmap_atomic(bdev, &dax);
+unlock_fallback:
+ dax_read_unlock(id);
fallback:
- trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
- dax.pfn, ret);
+ trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
return VM_FAULT_FALLBACK;
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 95c1c8d34539..9c351bf757b2 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat {
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
- struct backing_dev_info bdi;
};
/* file private data. */
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 151872dcc1f4..9014479d0160 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
- rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
+ rc = super_setup_bdi(s);
if (rc)
goto out1;
ecryptfs_set_superblock_private(s, sbi);
- s->s_bdi = &sbi->bdi;
/* ->kill_sb() will take care of sbi after that point */
sbi = NULL;
@@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
if (!sb_info)
return;
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
- bdi_destroy(&sb_info->bdi);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 341251421ced..5420767c9b68 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -42,6 +42,7 @@
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
+#include <net/busy_poll.h>
/*
* LOCKING:
@@ -224,6 +225,11 @@ struct eventpoll {
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ unsigned int napi_id;
+#endif
};
/* Wait structure used by the poll hooks */
@@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep)
return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct eventpoll *ep = p;
+
+ return ep_events_available(ep) || busy_loop_timeout(start_time);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/*
+ * Busy poll if globally on and supporting sockets found && no events,
+ * busy loop will return if need_resched or ep_events_available.
+ *
+ * we must do our busy polling with irqs enabled
+ */
+static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+ if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+ napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+#endif
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (ep->napi_id)
+ ep->napi_id = 0;
+#endif
+}
+
+/*
+ * Set epoll busy poll NAPI ID from sk.
+ */
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ struct eventpoll *ep;
+ unsigned int napi_id;
+ struct socket *sock;
+ struct sock *sk;
+ int err;
+
+ if (!net_busy_loop_on())
+ return;
+
+ sock = sock_from_file(epi->ffd.file, &err);
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+ ep = epi->ep;
+
+ /* Non-NAPI IDs can be rejected
+ * or
+ * Nothing to do if we already have this ID
+ */
+ if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+ return;
+
+ /* record NAPI ID for use in next busy poll */
+ ep->napi_id = napi_id;
+#endif
+}
+
/**
* ep_call_nested - Perform a bound (possibly) nested call, by checking
* that the recursion limit is not exceeded, and that
@@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
spin_lock_irqsave(&ep->lock, flags);
+ ep_set_busy_poll_napi_id(epi);
+
/*
* If the event mask does not contain any poll(2) event, we consider the
* descriptor to be disabled. This condition is likely the effect of the
@@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
+ /* record NAPI ID of new item if present */
+ ep_set_busy_poll_napi_id(epi);
+
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1637,10 +1719,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
fetch_events:
+
+ if (!ep_events_available(ep))
+ ep_busy_loop(ep, timed_out);
+
spin_lock_irqsave(&ep->lock, flags);
if (!ep_events_available(ep)) {
/*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
+ ep_reset_busy_poll_napi_id(ep);
+
+ /*
* We don't have any available event to return to the caller.
* We need to sleep here, and we will be wake up by
* ep_poll_callback() when events will become available.
diff --git a/fs/exec.c b/fs/exec.c
index 65145a3df065..72934df68471 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
+ arch_setup_new_exec();
perf_event_exec();
__set_task_comm(current, kbasename(bprm->filename), true);
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2e86086bc940..5dc392404559 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -64,7 +64,6 @@ struct exofs_dev {
* our extension to the in-memory superblock
*/
struct exofs_sb_info {
- struct backing_dev_info bdi; /* register our bdi with VFS */
struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
int s_timeout; /* timeout for OSD operations */
uint64_t s_nextid; /* highest object ID used */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 1076a4233b39..819624cfc8da 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb)
sbi->one_comp.obj.partition);
exofs_sysfs_sb_del(sbi);
- bdi_destroy(&sbi->bdi);
exofs_free_sbi(sbi);
sb->s_fs_info = NULL;
}
@@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
__sbi_read_stats(sbi);
/* set up operation vectors */
- sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
- sb->s_bdi = &sbi->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret) {
+ EXOFS_DBGMSG("Failed to super_setup_bdi\n");
+ goto free_sbi;
+ }
+ sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
sb->s_fs_info = sbi;
sb->s_op = &exofs_sops;
sb->s_export_op = &exofs_export_ops;
@@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- ret = bdi_setup_and_register(&sbi->bdi, "exofs");
- if (ret) {
- EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
- dput(sb->s_root);
- sb->s_root = NULL;
- goto free_sbi;
- }
-
exofs_sysfs_dbg_print();
_exofs_print_device("Mounting", opts->dev_name,
ore_comp_dev(&sbi->oc, 0),
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 5e64de9c5093..03f5ce1d3dbe 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -779,7 +779,6 @@ extern void ext2_evict_inode(struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int ext2_setattr (struct dentry *, struct iattr *);
extern void ext2_set_inode_flags(struct inode *inode);
-extern void ext2_get_inode_flags(struct ext2_inode_info *);
extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
@@ -796,7 +795,8 @@ void ext2_error(struct super_block *, const char *, const char *, ...);
extern __printf(3, 4)
void ext2_msg(struct super_block *, const char *, const char *, ...);
extern void ext2_update_dynamic_rev (struct super_block *sb);
-extern void ext2_write_super (struct super_block *);
+extern void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+ int wait);
/*
* Inodes and files operations
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 128cce540645..26d77f9f8c12 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
+ struct block_device *bdev;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
@@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return ret;
iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
+ bdev = inode->i_sb->s_bdev;
+ iomap->bdev = bdev;
iomap->offset = (u64)first_block << blkbits;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
if (ret == 0) {
iomap->type = IOMAP_HOLE;
@@ -835,6 +841,7 @@ static int
ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
+ put_dax(iomap->dax_dev);
if (iomap->type == IOMAP_MAPPED &&
written < length &&
(flags & IOMAP_WRITE))
@@ -1384,25 +1391,6 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DAX;
}
-/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
-void ext2_get_inode_flags(struct ext2_inode_info *ei)
-{
- unsigned int flags = ei->vfs_inode.i_flags;
-
- ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
- EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
- if (flags & S_SYNC)
- ei->i_flags |= EXT2_SYNC_FL;
- if (flags & S_APPEND)
- ei->i_flags |= EXT2_APPEND_FL;
- if (flags & S_IMMUTABLE)
- ei->i_flags |= EXT2_IMMUTABLE_FL;
- if (flags & S_NOATIME)
- ei->i_flags |= EXT2_NOATIME_FL;
- if (flags & S_DIRSYNC)
- ei->i_flags |= EXT2_DIRSYNC_FL;
-}
-
struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
{
struct ext2_inode_info *ei;
@@ -1563,7 +1551,6 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
if (ei->i_state & EXT2_STATE_NEW)
memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
- ext2_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
if (!(test_opt(sb, NO_UID32))) {
raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
@@ -1615,7 +1602,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
EXT2_SET_RO_COMPAT_FEATURE(sb,
EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
spin_unlock(&EXT2_SB(sb)->s_lock);
- ext2_write_super(sb);
+ ext2_sync_super(sb, EXT2_SB(sb)->s_es, 1);
}
}
}
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 191e02b28ce8..087f122cca42 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -29,7 +29,6 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case EXT2_IOC_GETFLAGS:
- ext2_get_inode_flags(ei);
flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT2_IOC_SETFLAGS: {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9e25a71fe1a2..8ac673c71a36 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -36,8 +36,7 @@
#include "xattr.h"
#include "acl.h"
-static void ext2_sync_super(struct super_block *sb,
- struct ext2_super_block *es, int wait);
+static void ext2_write_super(struct super_block *sb);
static int ext2_remount (struct super_block * sb, int * flags, char * data);
static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -123,13 +122,29 @@ void ext2_update_dynamic_rev(struct super_block *sb)
*/
}
+#ifdef CONFIG_QUOTA
+static int ext2_quota_off(struct super_block *sb, int type);
+
+static void ext2_quota_off_umount(struct super_block *sb)
+{
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ ext2_quota_off(sb, type);
+}
+#else
+static inline void ext2_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
static void ext2_put_super (struct super_block * sb)
{
int db_count;
int i;
struct ext2_sb_info *sbi = EXT2_SB(sb);
- dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ ext2_quota_off_umount(sb);
if (sbi->s_mb_cache) {
ext2_xattr_destroy_cache(sbi->s_mb_cache);
@@ -314,10 +329,23 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
#ifdef CONFIG_QUOTA
static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
+static int ext2_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path);
static struct dquot **ext2_get_dquots(struct inode *inode)
{
return EXT2_I(inode)->i_dquot;
}
+
+static const struct quotactl_ops ext2_quotactl_ops = {
+ .quota_on = ext2_quota_on,
+ .quota_off = ext2_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_state = dquot_get_state,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
+};
#endif
static const struct super_operations ext2_sops = {
@@ -1117,7 +1145,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->dq_op = &dquot_operations;
- sb->s_qcop = &dquot_quotactl_ops;
+ sb->s_qcop = &ext2_quotactl_ops;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
@@ -1194,8 +1222,8 @@ static void ext2_clear_super_error(struct super_block *sb)
}
}
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
- int wait)
+void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+ int wait)
{
ext2_clear_super_error(sb);
spin_lock(&EXT2_SB(sb)->s_lock);
@@ -1270,7 +1298,7 @@ static int ext2_unfreeze(struct super_block *sb)
return 0;
}
-void ext2_write_super(struct super_block *sb)
+static void ext2_write_super(struct super_block *sb)
{
if (!(sb->s_flags & MS_RDONLY))
ext2_sync_fs(sb, 1);
@@ -1548,6 +1576,51 @@ out:
return len - towrite;
}
+static int ext2_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path)
+{
+ int err;
+ struct inode *inode;
+
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ return err;
+
+ inode = d_inode(path->dentry);
+ inode_lock(inode);
+ EXT2_I(inode)->i_flags |= EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL;
+ inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+ S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+static int ext2_quota_off(struct super_block *sb, int type)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ int err;
+
+ if (!inode || !igrab(inode))
+ goto out;
+
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ EXT2_I(inode)->i_flags &= ~(EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL);
+ inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+out_put:
+ iput(inode);
+ return err;
+out:
+ return dquot_quota_off(sb, type);
+}
+
#endif
static struct file_system_type ext2_fs_type = {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 13742d6c83d2..8e8046104f4d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2476,7 +2476,6 @@ extern int ext4_truncate(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
-extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a4507deb925..5834c4d76be8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3340,6 +3340,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
+ struct block_device *bdev;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long last_block = (offset + length - 1) >> blkbits;
@@ -3408,7 +3409,12 @@ retry:
}
iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
+ bdev = inode->i_sb->s_bdev;
+ iomap->bdev = bdev;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
iomap->offset = first_block << blkbits;
if (ret == 0) {
@@ -3441,6 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
int blkbits = inode->i_blkbits;
bool truncate = false;
+ put_dax(iomap->dax_dev);
if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
return 0;
@@ -4537,31 +4544,6 @@ void ext4_set_inode_flags(struct inode *inode)
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
}
-/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
-void ext4_get_inode_flags(struct ext4_inode_info *ei)
-{
- unsigned int vfs_fl;
- unsigned long old_fl, new_fl;
-
- do {
- vfs_fl = ei->vfs_inode.i_flags;
- old_fl = ei->i_flags;
- new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
- EXT4_DIRSYNC_FL);
- if (vfs_fl & S_SYNC)
- new_fl |= EXT4_SYNC_FL;
- if (vfs_fl & S_APPEND)
- new_fl |= EXT4_APPEND_FL;
- if (vfs_fl & S_IMMUTABLE)
- new_fl |= EXT4_IMMUTABLE_FL;
- if (vfs_fl & S_NOATIME)
- new_fl |= EXT4_NOATIME_FL;
- if (vfs_fl & S_DIRSYNC)
- new_fl |= EXT4_DIRSYNC_FL;
- } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
-}
-
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
{
@@ -4998,7 +4980,6 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
i_uid = i_uid_read(inode);
i_gid = i_gid_read(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2779d5f291a4..0c21e22acd74 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -589,7 +589,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FS_IOC_GETFSMAP:
return ext4_ioc_getfsmap(sb, (void __user *)arg);
case EXT4_IOC_GETFLAGS:
- ext4_get_inode_flags(ei);
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
@@ -977,7 +976,6 @@ resizefs_out:
struct fsxattr fa;
memset(&fa, 0, sizeof(struct fsxattr));
- ext4_get_inode_flags(ei);
fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
if (ext4_has_feature_project(inode->i_sb)) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 931053cf20d6..96973ee74147 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -840,6 +840,28 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
}
}
+#ifdef CONFIG_QUOTA
+static int ext4_quota_off(struct super_block *sb, int type);
+
+static inline void ext4_quota_off_umount(struct super_block *sb)
+{
+ int type;
+
+ if (ext4_has_feature_quota(sb)) {
+ dquot_disable(sb, -1,
+ DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ } else {
+ /* Use our quota_off function to clear inode flags etc. */
+ for (type = 0; type < EXT4_MAXQUOTAS; type++)
+ ext4_quota_off(sb, type);
+ }
+}
+#else
+static inline void ext4_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -848,7 +870,7 @@ static void ext4_put_super(struct super_block *sb)
int i, err;
ext4_unregister_li_request(sb);
- dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ ext4_quota_off_umount(sb);
flush_workqueue(sbi->rsv_conversion_wq);
destroy_workqueue(sbi->rsv_conversion_wq);
@@ -1219,7 +1241,6 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path);
-static int ext4_quota_off(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
@@ -5352,11 +5373,33 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (err)
return err;
}
+
lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
err = dquot_quota_on(sb, type, format_id, path);
- if (err)
+ if (err) {
lockdep_set_quota_inode(path->dentry->d_inode,
I_DATA_SEM_NORMAL);
+ } else {
+ struct inode *inode = d_inode(path->dentry);
+ handle_t *handle;
+
+ /*
+ * Set inode flags to prevent userspace from messing with quota
+ * files. If this fails, we return success anyway since quotas
+ * are already enabled and this is not a hard failure.
+ */
+ inode_lock(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
+ if (IS_ERR(handle))
+ goto unlock_inode;
+ EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
+ inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+ S_NOATIME | S_IMMUTABLE);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ unlock_inode:
+ inode_unlock(inode);
+ }
return err;
}
@@ -5430,24 +5473,39 @@ static int ext4_quota_off(struct super_block *sb, int type)
{
struct inode *inode = sb_dqopt(sb)->files[type];
handle_t *handle;
+ int err;
/* Force all delayed allocation blocks to be allocated.
* Caller already holds s_umount sem */
if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
- if (!inode)
+ if (!inode || !igrab(inode))
goto out;
- /* Update modification times of quota files when userspace can
- * start looking at them */
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ /*
+ * Update modification times of quota files when userspace can
+ * start looking at them. If we fail, we return success anyway since
+ * this is not a hard failure and quotas are already disabled.
+ */
handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
if (IS_ERR(handle))
- goto out;
+ goto out_unlock;
+ EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
+ inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
inode->i_mtime = inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
-
+out_unlock:
+ inode_unlock(inode);
+out_put:
+ iput(inode);
+ return err;
out:
return dquot_quota_off(sb, type);
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index be8fbe289087..8bd81c2e89b2 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -23,6 +23,7 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/shmem_fs.h>
+#include <linux/compat.h>
#include <asm/poll.h>
#include <asm/siginfo.h>
@@ -420,6 +421,162 @@ out:
}
#endif
+#ifdef CONFIG_COMPAT
+static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+ if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+ __get_user(kfl->l_type, &ufl->l_type) ||
+ __get_user(kfl->l_whence, &ufl->l_whence) ||
+ __get_user(kfl->l_start, &ufl->l_start) ||
+ __get_user(kfl->l_len, &ufl->l_len) ||
+ __get_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+
+static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+ if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+ __put_user(kfl->l_type, &ufl->l_type) ||
+ __put_user(kfl->l_whence, &ufl->l_whence) ||
+ __put_user(kfl->l_start, &ufl->l_start) ||
+ __put_user(kfl->l_len, &ufl->l_len) ||
+ __put_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+
+#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
+static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+ if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+ __get_user(kfl->l_type, &ufl->l_type) ||
+ __get_user(kfl->l_whence, &ufl->l_whence) ||
+ __get_user(kfl->l_start, &ufl->l_start) ||
+ __get_user(kfl->l_len, &ufl->l_len) ||
+ __get_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
+static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+ if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+ __put_user(kfl->l_type, &ufl->l_type) ||
+ __put_user(kfl->l_whence, &ufl->l_whence) ||
+ __put_user(kfl->l_start, &ufl->l_start) ||
+ __put_user(kfl->l_len, &ufl->l_len) ||
+ __put_user(kfl->l_pid, &ufl->l_pid))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+ switch (cmd) {
+ case F_GETLK64:
+ return F_GETLK;
+ case F_SETLK64:
+ return F_SETLK;
+ case F_SETLKW64:
+ return F_SETLKW;
+ }
+
+ return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
+{
+ mm_segment_t old_fs;
+ struct flock f;
+ long ret;
+ unsigned int conv_cmd;
+
+ switch (cmd) {
+ case F_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
+ ret = get_compat_flock(&f, compat_ptr(arg));
+ if (ret != 0)
+ break;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_fcntl(fd, cmd, (unsigned long)&f);
+ set_fs(old_fs);
+ if (cmd == F_GETLK && ret == 0) {
+ /* GETLK was successful and we need to return the data...
+ * but it needs to fit in the compat structure.
+ * l_start shouldn't be too big, unless the original
+ * start + end is greater than COMPAT_OFF_T_MAX, in which
+ * case the app was asking for trouble, so we return
+ * -EOVERFLOW in that case.
+ * l_len could be too big, in which case we just truncate it,
+ * and only allow the app to see that part of the conflicting
+ * lock that might make sense to it anyway
+ */
+
+ if (f.l_start > COMPAT_OFF_T_MAX)
+ ret = -EOVERFLOW;
+ if (f.l_len > COMPAT_OFF_T_MAX)
+ f.l_len = COMPAT_OFF_T_MAX;
+ if (ret == 0)
+ ret = put_compat_flock(&f, compat_ptr(arg));
+ }
+ break;
+
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+ ret = get_compat_flock64(&f, compat_ptr(arg));
+ if (ret != 0)
+ break;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ conv_cmd = convert_fcntl_cmd(cmd);
+ ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
+ set_fs(old_fs);
+ if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
+ /* need to return lock information - see above for commentary */
+ if (f.l_start > COMPAT_LOFF_T_MAX)
+ ret = -EOVERFLOW;
+ if (f.l_len > COMPAT_LOFF_T_MAX)
+ f.l_len = COMPAT_LOFF_T_MAX;
+ if (ret == 0)
+ ret = put_compat_flock64(&f, compat_ptr(arg));
+ }
+ break;
+
+ default:
+ ret = sys_fcntl(fd, cmd, arg);
+ break;
+ }
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
+{
+ switch (cmd) {
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+ return -EINVAL;
+ }
+ return compat_sys_fcntl64(fd, cmd, arg);
+}
+#endif
+
/* Table to convert sigio signal codes into poll band bitmaps */
static const long band_table[NSIGPOLL] = {
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 5559168d5637..58a61f55e0d0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -9,6 +9,7 @@
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/compat.h>
#include "internal.h"
#include "mount.h"
@@ -264,3 +265,15 @@ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
ret = do_handle_open(mountdirfd, handle, flags);
return ret;
}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+ struct file_handle __user *, handle, int, flags)
+{
+ return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b681b43c766e..c2d7f3a92679 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fc->blocked_waitq);
if (fc->num_background == fc->congestion_threshold &&
- fc->connected && fc->bdi_initialized) {
- clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ fc->connected && fc->sb) {
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
@@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold &&
- fc->bdi_initialized) {
- set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fc->sb) {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 32ac2c9b09c0..f33341d9501a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -527,9 +527,6 @@ struct fuse_conn {
/** Filesystem supports NFS exporting. Only set in INIT */
unsigned export_support:1;
- /** Set if bdi is valid */
- unsigned bdi_initialized:1;
-
/** write-back cache policy (default is write-through) */
unsigned writeback_cache:1;
@@ -631,9 +628,6 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Backing dev info */
- struct backing_dev_info bdi;
-
/** Entry on the fuse_conn_list */
struct list_head entry;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6fe6a88ecb4a..73cf05135252 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -386,12 +386,6 @@ static void fuse_send_destroy(struct fuse_conn *fc)
}
}
-static void fuse_bdi_destroy(struct fuse_conn *fc)
-{
- if (fc->bdi_initialized)
- bdi_destroy(&fc->bdi);
-}
-
static void fuse_put_super(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -403,7 +397,6 @@ static void fuse_put_super(struct super_block *sb)
list_del(&fc->entry);
fuse_ctl_remove_conn(fc);
mutex_unlock(&fuse_mutex);
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
}
@@ -928,7 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->no_flock = 1;
}
- fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+ fc->sb->s_bdi->ra_pages =
+ min(fc->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -944,7 +938,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
- arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
+ arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -976,27 +970,18 @@ static void fuse_free_conn(struct fuse_conn *fc)
static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ char *suffix = "";
- fc->bdi.name = "fuse";
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
-
- err = bdi_init(&fc->bdi);
+ if (sb->s_bdev)
+ suffix = "-fuseblk";
+ err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
+ MINOR(fc->dev), suffix);
if (err)
return err;
- fc->bdi_initialized = 1;
-
- if (sb->s_bdev) {
- err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
- MAJOR(fc->dev), MINOR(fc->dev));
- } else {
- err = bdi_register_dev(&fc->bdi, fc->dev);
- }
-
- if (err)
- return err;
+ sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ /* fuse does it's own writeback accounting */
+ sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
@@ -1010,7 +995,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
*
* /sys/class/bdi/<bdi>/max_ratio
*/
- bdi_set_max_ratio(&fc->bdi, 1);
+ bdi_set_max_ratio(sb->s_bdi, 1);
return 0;
}
@@ -1113,8 +1098,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_dev_free;
- sb->s_bdi = &fc->bdi;
-
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1182,7 +1165,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
err_dev_free:
fuse_dev_free(fud);
err_put_conn:
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
err_fput:
fput(file);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 01b97c012c6e..3814a60e0aea 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -38,11 +38,6 @@ struct metapath {
__u16 mp_list[GFS2_MAX_META_HEIGHT];
};
-struct strip_mine {
- int sm_first;
- unsigned int sm_height;
-};
-
/**
* gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
* @ip: the inode
@@ -253,6 +248,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
}
/**
+ * metaptr1 - Return the first possible metadata pointer in a metaath buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ */
+static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
+{
+ struct buffer_head *bh = mp->mp_bh[height];
+ if (height == 0)
+ return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
+ return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
+}
+
+/**
* metapointer - Return pointer to start of metadata in a buffer
* @height: The metadata height (0 = dinode)
* @mp: The metapath
@@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
- struct buffer_head *bh = mp->mp_bh[height];
- unsigned int head_size = (height > 0) ?
- sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
- return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+ __be64 *p = metaptr1(height, mp);
+ return p + mp->mp_list[height];
}
static void gfs2_metapath_ra(struct gfs2_glock *gl,
@@ -296,6 +302,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
}
/**
+ * lookup_mp_height - helper function for lookup_metapath
+ * @ip: the inode
+ * @mp: the metapath
+ * @h: the height which needs looking up
+ */
+static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+ __be64 *ptr = metapointer(h, mp);
+ u64 dblock = be64_to_cpu(*ptr);
+
+ if (!dblock)
+ return h + 1;
+
+ return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
+}
+
+/**
* lookup_metapath - Walk the metadata tree to a specific point
* @ip: The inode
* @mp: The metapath
@@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
unsigned int end_of_metadata = ip->i_height - 1;
unsigned int x;
- __be64 *ptr;
- u64 dblock;
int ret;
for (x = 0; x < end_of_metadata; x++) {
- ptr = metapointer(x, mp);
- dblock = be64_to_cpu(*ptr);
- if (!dblock)
- return x + 1;
-
- ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
+ ret = lookup_mp_height(ip, mp, x);
if (ret)
return ret;
}
@@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
return ip->i_height;
}
+/**
+ * fillup_metapath - fill up buffers for the metadata path to a specific height
+ * @ip: The inode
+ * @mp: The metapath
+ * @h: The height to which it should be mapped
+ *
+ * Similar to lookup_metapath, but does lookups for a range of heights
+ *
+ * Returns: error or height of metadata tree
+ */
+
+static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+ unsigned int start_h = h - 1;
+ int ret;
+
+ if (h) {
+ /* find the first buffer we need to look up. */
+ while (start_h > 0 && mp->mp_bh[start_h] == NULL)
+ start_h--;
+ for (; start_h < h; start_h++) {
+ ret = lookup_mp_height(ip, mp, start_h);
+ if (ret)
+ return ret;
+ }
+ }
+ return ip->i_height;
+}
+
static inline void release_metapath(struct metapath *mp)
{
int i;
@@ -422,6 +467,13 @@ enum alloc_state {
/* ALLOC_UNSTUFF = 3, TBD and rather complicated */
};
+static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
+{
+ if (hgt)
+ return sdp->sd_inptrs;
+ return sdp->sd_diptrs;
+}
+
/**
* gfs2_bmap_alloc - Build a metadata tree of the requested height
* @inode: The GFS2 inode
@@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
BUG_ON(maxlen == 0);
- memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+ memset(&mp, 0, sizeof(mp));
bmap_lock(ip, create);
clear_buffer_mapped(bh_map);
clear_buffer_new(bh_map);
@@ -702,252 +754,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
}
/**
- * do_strip - Look for a layer a particular layer of the file and strip it off
- * @ip: the inode
- * @dibh: the dinode buffer
- * @bh: A buffer of pointers
- * @top: The first pointer in the buffer
- * @bottom: One more than the last pointer
- * @height: the height this buffer is at
- * @sm: a pointer to a struct strip_mine
- *
- * Returns: errno
- */
-
-static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct buffer_head *bh, __be64 *top, __be64 *bottom,
- unsigned int height, struct strip_mine *sm)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_rgrp_list rlist;
- struct gfs2_trans *tr;
- u64 bn, bstart;
- u32 blen, btotal;
- __be64 *p;
- unsigned int rg_blocks = 0;
- int metadata;
- unsigned int revokes = 0;
- int x;
- int error;
- int jblocks_rqsted;
-
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
-
- if (!*top)
- sm->sm_first = 0;
-
- if (height != sm->sm_height)
- return 0;
-
- if (sm->sm_first) {
- top++;
- sm->sm_first = 0;
- }
-
- metadata = (height != ip->i_height - 1);
- if (metadata)
- revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
- else if (ip->i_depth)
- revokes = sdp->sd_inptrs;
-
- memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- bstart = 0;
- blen = 0;
-
- for (p = top; p < bottom; p++) {
- if (!*p)
- continue;
-
- bn = be64_to_cpu(*p);
-
- if (bstart + blen == bn)
- blen++;
- else {
- if (bstart)
- gfs2_rlist_add(ip, &rlist, bstart);
-
- bstart = bn;
- blen = 1;
- }
- }
-
- if (bstart)
- gfs2_rlist_add(ip, &rlist, bstart);
- else
- goto out; /* Nothing to do */
-
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
- for (x = 0; x < rlist.rl_rgrps; x++) {
- struct gfs2_rgrpd *rgd;
- rgd = rlist.rl_ghs[x].gh_gl->gl_object;
- rg_blocks += rgd->rd_length;
- }
-
- error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
- if (error)
- goto out_rlist;
-
- if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
- gfs2_rs_deltree(&ip->i_res);
-
-restart:
- jblocks_rqsted = rg_blocks + RES_DINODE +
- RES_INDIRECT + RES_STATFS + RES_QUOTA +
- gfs2_struct2blk(sdp, revokes, sizeof(u64));
- if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
- jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
- error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
- if (error)
- goto out_rg_gunlock;
-
- tr = current->journal_info;
- down_write(&ip->i_rw_mutex);
-
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_trans_add_meta(ip->i_gl, bh);
-
- bstart = 0;
- blen = 0;
- btotal = 0;
-
- for (p = top; p < bottom; p++) {
- if (!*p)
- continue;
-
- /* check for max reasonable journal transaction blocks */
- if (tr->tr_num_buf_new + RES_STATFS +
- RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
- if (rg_blocks >= tr->tr_num_buf_new)
- rg_blocks -= tr->tr_num_buf_new;
- else
- rg_blocks = 0;
- break;
- }
-
- bn = be64_to_cpu(*p);
-
- if (bstart + blen == bn)
- blen++;
- else {
- if (bstart) {
- __gfs2_free_blocks(ip, bstart, blen, metadata);
- btotal += blen;
- }
-
- bstart = bn;
- blen = 1;
- }
-
- *p = 0;
- gfs2_add_inode_blocks(&ip->i_inode, -1);
- }
- if (p == bottom)
- rg_blocks = 0;
-
- if (bstart) {
- __gfs2_free_blocks(ip, bstart, blen, metadata);
- btotal += blen;
- }
-
- gfs2_statfs_change(sdp, 0, +btotal, 0);
- gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
- ip->i_inode.i_gid);
-
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
-
- gfs2_dinode_out(ip, dibh->b_data);
-
- up_write(&ip->i_rw_mutex);
-
- gfs2_trans_end(sdp);
-
- if (rg_blocks)
- goto restart;
-
-out_rg_gunlock:
- gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist:
- gfs2_rlist_free(&rlist);
-out:
- return error;
-}
-
-/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @sm: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
- struct metapath *mp, unsigned int height,
- u64 block, int first, struct strip_mine *sm)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct buffer_head *bh = NULL;
- __be64 *top, *bottom;
- u64 bn;
- int error;
- int mh_size = sizeof(struct gfs2_meta_header);
-
- if (!height) {
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- return error;
- dibh = bh;
-
- top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
- bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
- } else {
- error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
- if (error)
- return error;
-
- top = (__be64 *)(bh->b_data + mh_size) +
- (first ? mp->mp_list[height] : 0);
-
- bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
- }
-
- error = do_strip(ip, dibh, bh, top, bottom, height, sm);
- if (error)
- goto out;
-
- if (height < ip->i_height - 1) {
-
- gfs2_metapath_ra(ip->i_gl, bh, top);
-
- for (; top < bottom; top++, first = 0) {
- if (!*top)
- continue;
-
- bn = be64_to_cpu(*top);
-
- error = recursive_scan(ip, dibh, mp, height + 1, bn,
- first, sm);
- if (error)
- break;
- }
- }
-out:
- brelse(bh);
- return error;
-}
-
-
-/**
* gfs2_block_truncate_page - Deal with zeroing out data for truncate
*
* This is partly borrowed from ext3.
@@ -1106,41 +912,406 @@ out:
return error;
}
-static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+/**
+ * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
+ * @ip: inode
+ * @rg_gh: holder of resource group glock
+ * @mp: current metapath fully populated with buffers
+ * @btotal: place to keep count of total blocks freed
+ * @hgt: height we're processing
+ * @first: true if this is the first call to this function for this height
+ *
+ * We sweep a metadata buffer (provided by the metapath) for blocks we need to
+ * free, and free them all. However, we do it one rgrp at a time. If this
+ * block has references to multiple rgrps, we break it into individual
+ * transactions. This allows other processes to use the rgrps while we're
+ * focused on a single one, for better concurrency / performance.
+ * At every transaction boundary, we rewrite the inode into the journal.
+ * That way the bitmaps are kept consistent with the inode and we can recover
+ * if we're interrupted by power-outages.
+ *
+ * Returns: 0, or return code if an error occurred.
+ * *btotal has the total number of blocks freed
+ */
+static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
+ const struct metapath *mp, u32 *btotal, int hgt,
+ bool preserve1)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int height = ip->i_height;
- u64 lblock;
- struct metapath mp;
- int error;
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_trans *tr;
+ struct buffer_head *bh = mp->mp_bh[hgt];
+ __be64 *top, *bottom, *p;
+ int blks_outside_rgrp;
+ u64 bn, bstart, isize_blks;
+ s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
+ int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
+ int ret = 0;
+ bool buf_in_tr = false; /* buffer was added to transaction */
+
+ if (gfs2_metatype_check(sdp, bh,
+ (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
+ return -EIO;
+
+more_rgrps:
+ blks_outside_rgrp = 0;
+ bstart = 0;
+ blen = 0;
+ top = metapointer(hgt, mp); /* first ptr from metapath */
+ /* If we're keeping some data at the truncation point, we've got to
+ preserve the metadata tree by adding 1 to the starting metapath. */
+ if (preserve1)
+ top++;
+
+ bottom = (__be64 *)(bh->b_data + bh->b_size);
+
+ for (p = top; p < bottom; p++) {
+ if (!*p)
+ continue;
+ bn = be64_to_cpu(*p);
+ if (gfs2_holder_initialized(rd_gh)) {
+ rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+ gfs2_assert_withdraw(sdp,
+ gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
+ } else {
+ rgd = gfs2_blk2rgrpd(sdp, bn, false);
+ ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+ 0, rd_gh);
+ if (ret)
+ goto out;
+
+ /* Must be done with the rgrp glock held: */
+ if (gfs2_rs_active(&ip->i_res) &&
+ rgd == ip->i_res.rs_rbm.rgd)
+ gfs2_rs_deltree(&ip->i_res);
+ }
+
+ if (!rgrp_contains_block(rgd, bn)) {
+ blks_outside_rgrp++;
+ continue;
+ }
+
+ /* The size of our transactions will be unknown until we
+ actually process all the metadata blocks that relate to
+ the rgrp. So we estimate. We know it can't be more than
+ the dinode's i_blocks and we don't want to exceed the
+ journal flush threshold, sd_log_thresh2. */
+ if (current->journal_info == NULL) {
+ unsigned int jblocks_rqsted, revokes;
+
+ jblocks_rqsted = rgd->rd_length + RES_DINODE +
+ RES_INDIRECT;
+ isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
+ if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
+ jblocks_rqsted +=
+ atomic_read(&sdp->sd_log_thresh2);
+ else
+ jblocks_rqsted += isize_blks;
+ revokes = jblocks_rqsted;
+ if (meta)
+ revokes += hptrs(sdp, hgt);
+ else if (ip->i_depth)
+ revokes += sdp->sd_inptrs;
+ ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
+ if (ret)
+ goto out_unlock;
+ down_write(&ip->i_rw_mutex);
+ }
+ /* check if we will exceed the transaction blocks requested */
+ tr = current->journal_info;
+ if (tr->tr_num_buf_new + RES_STATFS +
+ RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
+ /* We set blks_outside_rgrp to ensure the loop will
+ be repeated for the same rgrp, but with a new
+ transaction. */
+ blks_outside_rgrp++;
+ /* This next part is tricky. If the buffer was added
+ to the transaction, we've already set some block
+ pointers to 0, so we better follow through and free
+ them, or we will introduce corruption (so break).
+ This may be impossible, or at least rare, but I
+ decided to cover the case regardless.
+
+ If the buffer was not added to the transaction
+ (this call), doing so would exceed our transaction
+ size, so we need to end the transaction and start a
+ new one (so goto). */
+
+ if (buf_in_tr)
+ break;
+ goto out_unlock;
+ }
+
+ gfs2_trans_add_meta(ip->i_gl, bh);
+ buf_in_tr = true;
+ *p = 0;
+ if (bstart + blen == bn) {
+ blen++;
+ continue;
+ }
+ if (bstart) {
+ __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ (*btotal) += blen;
+ gfs2_add_inode_blocks(&ip->i_inode, -blen);
+ }
+ bstart = bn;
+ blen = 1;
+ }
+ if (bstart) {
+ __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ (*btotal) += blen;
+ gfs2_add_inode_blocks(&ip->i_inode, -blen);
+ }
+out_unlock:
+ if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
+ outside the rgrp we just processed,
+ do it all over again. */
+ if (current->journal_info) {
+ struct buffer_head *dibh = mp->mp_bh[0];
+
+ /* Every transaction boundary, we rewrite the dinode
+ to keep its di_blocks current in case of failure. */
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime =
+ CURRENT_TIME;
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ }
+ gfs2_glock_dq_uninit(rd_gh);
+ cond_resched();
+ goto more_rgrps;
+ }
+out:
+ return ret;
+}
+
+/**
+ * find_nonnull_ptr - find a non-null pointer given a metapath and height
+ * assumes the metapath is valid (with buffers) out to height h
+ * @mp: starting metapath
+ * @h: desired height to search
+ *
+ * Returns: true if a non-null pointer was found in the metapath buffer
+ * false if all remaining pointers are NULL in the buffer
+ */
+static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
+ unsigned int h)
+{
+ __be64 *ptr;
+ unsigned int ptrs = hptrs(sdp, h) - 1;
+
+ while (true) {
+ ptr = metapointer(h, mp);
+ if (*ptr) /* if we have a non-null pointer */
+ return true;
+
+ if (mp->mp_list[h] < ptrs)
+ mp->mp_list[h]++;
+ else
+ return false; /* no more pointers in this buffer */
+ }
+}
+
+enum dealloc_states {
+ DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */
+ DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
+ DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */
+ DEALLOC_DONE = 3, /* process complete */
+};
- if (!size)
+/**
+ * trunc_dealloc - truncate a file down to a desired size
+ * @ip: inode to truncate
+ * @newsize: The desired size of the file
+ *
+ * This function truncates a file to newsize. It works from the
+ * bottom up, and from the right to the left. In other words, it strips off
+ * the highest layer (data) before stripping any of the metadata. Doing it
+ * this way is best in case the operation is interrupted by power failure, etc.
+ * The dinode is rewritten in every transaction to guarantee integrity.
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct metapath mp;
+ struct buffer_head *dibh, *bh;
+ struct gfs2_holder rd_gh;
+ u64 lblock;
+ __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
+ unsigned int strip_h = ip->i_height - 1;
+ u32 btotal = 0;
+ int ret, state;
+ int mp_h; /* metapath buffers are read in to this height */
+ sector_t last_ra = 0;
+ u64 prev_bnr = 0;
+ bool preserve1; /* need to preserve the first meta pointer? */
+
+ if (!newsize)
lblock = 0;
else
- lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+ lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
+ memset(&mp, 0, sizeof(mp));
find_metapath(sdp, lblock, &mp, ip->i_height);
- error = gfs2_rindex_update(sdp);
- if (error)
- return error;
- error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
- if (error)
- return error;
+ memcpy(&nbof, &mp.mp_list, sizeof(nbof));
+
+ ret = gfs2_meta_inode_buffer(ip, &dibh);
+ if (ret)
+ return ret;
- while (height--) {
- struct strip_mine sm;
- sm.sm_first = !!size;
- sm.sm_height = height;
+ mp.mp_bh[0] = dibh;
+ ret = lookup_metapath(ip, &mp);
+ if (ret == ip->i_height)
+ state = DEALLOC_MP_FULL; /* We have a complete metapath */
+ else
+ state = DEALLOC_FILL_MP; /* deal with partial metapath */
- error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
- if (error)
+ ret = gfs2_rindex_update(sdp);
+ if (ret)
+ goto out_metapath;
+
+ ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+ if (ret)
+ goto out_metapath;
+ gfs2_holder_mark_uninitialized(&rd_gh);
+
+ mp_h = strip_h;
+
+ while (state != DEALLOC_DONE) {
+ switch (state) {
+ /* Truncate a full metapath at the given strip height.
+ * Note that strip_h == mp_h in order to be in this state. */
+ case DEALLOC_MP_FULL:
+ if (mp_h > 0) { /* issue read-ahead on metadata */
+ __be64 *top;
+
+ bh = mp.mp_bh[mp_h - 1];
+ if (bh->b_blocknr != last_ra) {
+ last_ra = bh->b_blocknr;
+ top = metaptr1(mp_h - 1, &mp);
+ gfs2_metapath_ra(ip->i_gl, bh, top);
+ }
+ }
+ /* If we're truncating to a non-zero size and the mp is
+ at the beginning of file for the strip height, we
+ need to preserve the first metadata pointer. */
+ preserve1 = (newsize &&
+ (mp.mp_list[mp_h] == nbof[mp_h]));
+ bh = mp.mp_bh[mp_h];
+ gfs2_assert_withdraw(sdp, bh);
+ if (gfs2_assert_withdraw(sdp,
+ prev_bnr != bh->b_blocknr)) {
+ printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
+ "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
+ sdp->sd_fsname,
+ (unsigned long long)ip->i_no_addr,
+ prev_bnr, ip->i_height, strip_h, mp_h);
+ }
+ prev_bnr = bh->b_blocknr;
+ ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
+ mp_h, preserve1);
+ /* If we hit an error or just swept dinode buffer,
+ just exit. */
+ if (ret || !mp_h) {
+ state = DEALLOC_DONE;
+ break;
+ }
+ state = DEALLOC_MP_LOWER;
+ break;
+
+ /* lower the metapath strip height */
+ case DEALLOC_MP_LOWER:
+ /* We're done with the current buffer, so release it,
+ unless it's the dinode buffer. Then back up to the
+ previous pointer. */
+ if (mp_h) {
+ brelse(mp.mp_bh[mp_h]);
+ mp.mp_bh[mp_h] = NULL;
+ }
+ /* If we can't get any lower in height, we've stripped
+ off all we can. Next step is to back up and start
+ stripping the previous level of metadata. */
+ if (mp_h == 0) {
+ strip_h--;
+ memcpy(&mp.mp_list, &nbof, sizeof(nbof));
+ mp_h = strip_h;
+ state = DEALLOC_FILL_MP;
+ break;
+ }
+ mp.mp_list[mp_h] = 0;
+ mp_h--; /* search one metadata height down */
+ if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
+ break; /* loop around in the same state */
+ mp.mp_list[mp_h]++;
+ /* Here we've found a part of the metapath that is not
+ * allocated. We need to search at that height for the
+ * next non-null pointer. */
+ if (find_nonnull_ptr(sdp, &mp, mp_h)) {
+ state = DEALLOC_FILL_MP;
+ mp_h++;
+ }
+ /* No more non-null pointers at this height. Back up
+ to the previous height and try again. */
+ break; /* loop around in the same state */
+
+ /* Fill the metapath with buffers to the given height. */
+ case DEALLOC_FILL_MP:
+ /* Fill the buffers out to the current height. */
+ ret = fillup_metapath(ip, &mp, mp_h);
+ if (ret < 0)
+ goto out;
+
+ /* If buffers found for the entire strip height */
+ if ((ret == ip->i_height) && (mp_h == strip_h)) {
+ state = DEALLOC_MP_FULL;
+ break;
+ }
+ if (ret < ip->i_height) /* We have a partial height */
+ mp_h = ret - 1;
+
+ /* If we find a non-null block pointer, crawl a bit
+ higher up in the metapath and try again, otherwise
+ we need to look lower for a new starting point. */
+ if (find_nonnull_ptr(sdp, &mp, mp_h))
+ mp_h++;
+ else
+ state = DEALLOC_MP_LOWER;
break;
+ }
}
- gfs2_quota_unhold(ip);
+ if (btotal) {
+ if (current->journal_info == NULL) {
+ ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+ RES_QUOTA, 0);
+ if (ret)
+ goto out;
+ down_write(&ip->i_rw_mutex);
+ }
+ gfs2_statfs_change(sdp, 0, +btotal, 0);
+ gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+ ip->i_inode.i_gid);
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ }
- return error;
+out:
+ if (gfs2_holder_initialized(&rd_gh))
+ gfs2_glock_dq_uninit(&rd_gh);
+ if (current->journal_info) {
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ cond_resched();
+ }
+ gfs2_quota_unhold(ip);
+out_metapath:
+ release_metapath(&mp);
+ return ret;
}
static int trunc_end(struct gfs2_inode *ip)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fe2a59c6a9a..c2062a108d19 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -911,11 +911,15 @@ out_qunlock:
static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int ret;
- if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+ /* fallocate is needed by gfs2_grow to reserve space in the rindex */
+ if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex)
return -EOPNOTSUPP;
inode_lock(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ec0848fcca02..959a19ced4d5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock);
static struct rhashtable_params ht_parms = {
.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
- .key_len = sizeof(struct lm_lockname),
+ .key_len = offsetofend(struct lm_lockname, ln_type),
.key_offset = offsetof(struct gfs2_glock, gl_name),
.head_offset = offsetof(struct gfs2_glock, gl_node),
};
@@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock)
unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+ target != LM_ST_UNLOCKED)
+ return;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
GLOCK_BUG_ON(gl, gl->gl_state == target);
@@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock)
}
else if (ret) {
pr_err("lm_lock ret %d\n", ret);
- GLOCK_BUG_ON(gl, 1);
+ GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN,
+ &sdp->sd_flags));
}
} else { /* lock_nolock */
finish_xmote(gl, target);
@@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
struct lm_lockname name = { .ln_number = number,
.ln_type = glops->go_type,
.ln_sbd = sdp };
- struct gfs2_glock *gl, *tmp = NULL;
+ struct gfs2_glock *gl, *tmp;
struct address_space *mapping;
struct kmem_cache *cachep;
- int ret, tries = 0;
+ int ret = 0;
rcu_read_lock();
gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
@@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
}
again:
- ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
- ht_parms);
- if (ret == 0) {
+ rcu_read_lock();
+ tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
+ ht_parms);
+ if (!tmp) {
*glp = gl;
- return 0;
+ goto out;
}
-
- if (ret == -EEXIST) {
- ret = 0;
- rcu_read_lock();
- tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
- if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
- if (++tries < 100) {
- rcu_read_unlock();
- cond_resched();
- goto again;
- }
- tmp = NULL;
- ret = -ENOMEM;
- }
- rcu_read_unlock();
- } else {
- WARN_ON_ONCE(ret);
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto out_free;
}
+ if (lockref_get_not_dead(&tmp->gl_lockref)) {
+ *glp = tmp;
+ goto out_free;
+ }
+ rcu_read_unlock();
+ cond_resched();
+ goto again;
+
+out_free:
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(cachep, gl);
atomic_dec(&sdp->sd_glock_disposal);
- *glp = tmp;
+out:
+ rcu_read_unlock();
return ret;
}
@@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = {
#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL)
-static int gfs2_glocks_open(struct inode *inode, struct file *file)
+static int __gfs2_glocks_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
{
- int ret = seq_open_private(file, &gfs2_glock_seq_ops,
- sizeof(struct gfs2_glock_iter));
+ int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter));
if (ret == 0) {
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
@@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
+ rhashtable_walk_enter(&gl_hash_table, &gi->hti);
}
return ret;
}
+static int gfs2_glocks_open(struct inode *inode, struct file *file)
+{
+ return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops);
+}
+
static int gfs2_glocks_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
@@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file)
static int gfs2_glstats_open(struct inode *inode, struct file *file)
{
- int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
- sizeof(struct gfs2_glock_iter));
- if (ret == 0) {
- struct seq_file *seq = file->private_data;
- struct gfs2_glock_iter *gi = seq->private;
- gi->sdp = inode->i_private;
- gi->last_pos = 0;
- seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
- if (seq->buf)
- seq->size = GFS2_SEQ_GOODSIZE;
- gi->gl = NULL;
- ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
- }
- return ret;
+ return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops);
}
static int gfs2_sbstats_open(struct inode *inode, struct file *file)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 511e1ed7e2de..b7cf65d13561 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -203,11 +203,15 @@ enum {
DFL_DLM_RECOVERY = 6,
};
+/*
+ * We are using struct lm_lockname as an rhashtable key. Avoid holes within
+ * the struct; padding at the end is fine.
+ */
struct lm_lockname {
- struct gfs2_sbd *ln_sbd;
u64 ln_number;
+ struct gfs2_sbd *ln_sbd;
unsigned int ln_type;
-} __packed __aligned(sizeof(int));
+};
#define lm_name_equal(name1, name2) \
(((name1)->ln_number == (name2)->ln_number) && \
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e279c3ce27be..9f605ea4810c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_put:
if (io_gl)
gfs2_glock_put(io_gl);
@@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
ip->i_height = 0;
ip->i_depth = 0;
ip->i_entries = 0;
+ ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */
switch(mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108e7ba81af..ed67548b286c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -23,6 +23,7 @@
#include <linux/quotaops.h>
#include <linux/lockdep.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include "gfs2.h"
#include "incore.h"
@@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
-
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
return 0;
}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 86ccc0159393..83c9909ff14a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
}
}
-static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-{
- u64 first = rgd->rd_data0;
- u64 last = first + rgd->rd_data;
- return first <= block && block < last;
-}
-
/**
* gfs2_blk2rgrpd - Find resource group for a given data/meta block number
* @sdp: The GFS2 superblock
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 66b51cf66dfa..e90478e2f545 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
return rs && !RB_EMPTY_NODE(&rs->rs_node);
}
+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+ u64 first = rgd->rd_data0;
+ u64 last = first + rgd->rd_data;
+ return first <= block && block < last;
+}
+
extern void check_and_update_goal(struct gfs2_inode *ip);
#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 361796a84fce..29b0473f6e74 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
return;
-
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret) {
@@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1617,7 +1617,7 @@ out_unlock:
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_glock_dq(&ip->i_iopen_gh);
}
gfs2_holder_uninit(&ip->i_iopen_gh);
}
@@ -1639,8 +1639,7 @@ out:
if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
}
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7163fe014b57..dde861387a40 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -136,17 +136,26 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
vma->vm_ops = &hugetlb_vm_ops;
+ /*
+ * Offset passed to mmap (before page shift) could have been
+ * negative when represented as a (l)off_t.
+ */
+ if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0)
+ return -EINVAL;
+
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
+ len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ /* check for overflow */
+ if (len < vma_len)
+ return -EINVAL;
inode_lock(inode);
file_accessed(file);
ret = -ENOMEM;
- len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-
if (hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
@@ -155,7 +164,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
ret = 0;
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
- inode->i_size = len;
+ i_size_write(inode, len);
out:
inode_unlock(inode);
diff --git a/fs/inode.c b/fs/inode.c
index 88110fd0b282..131b2bcebc48 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_lru);
address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
-#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
-#endif
}
EXPORT_SYMBOL(inode_init_once);
diff --git a/fs/internal.h b/fs/internal.h
index 11c6d89dce9c..076751d90ba2 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -108,8 +108,6 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
-extern long do_handle_open(int mountdirfd,
- struct file_handle __user *ufh, int open_flag);
extern int open_check_o_direct(struct file *f);
extern int vfs_open(const struct path *, struct file *, const struct cred *);
extern struct file *filp_clone_open(struct file *);
diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd55a8b..1faabe09b8fd 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -360,7 +360,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
sector_t sector = iomap->blkno +
(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
- return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+ return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
+ offset, bytes);
}
static loff_t
@@ -887,16 +888,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
flags |= IOMAP_WRITE;
}
- if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(mapping, start, end);
- if (ret)
- goto out_free_dio;
+ ret = filemap_write_and_wait_range(mapping, start, end);
+ if (ret)
+ goto out_free_dio;
- ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
+ ret = invalidate_inode_pages2_range(mapping,
+ start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
inode_dio_begin(inode);
@@ -911,6 +910,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
break;
}
pos += ret;
+
+ if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+ break;
} while ((count = iov_iter_count(iter)) > 0);
blk_finish_plug(&plug);
@@ -951,7 +953,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
* one is a pretty crazy thing to do, so we don't support it 100%. If
* this invalidation fails, tough, the write still worked...
*/
- if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+ if (iov_iter_rw(iter) == WRITE) {
int err = invalidate_inode_pages2_range(mapping,
start >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1ce13479b10d..5a0245e36240 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
#include <linux/backing-dev.h>
#include <linux/bitops.h>
#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
#define CREATE_TRACE_POINTS
#include <trace/events/jbd2.h>
@@ -206,6 +207,14 @@ static int kjournald2(void *arg)
wake_up(&journal->j_wait_done_commit);
/*
+ * Make sure that no allocations from this kernel thread will ever
+ * recurse to the fs layer because we are responsible for the
+ * transaction commit and any fs involvement might get stuck waiting for
+ * the trasn. commit.
+ */
+ memalloc_nofs_save();
+
+ /*
* And now, wait forever for commit wakeup events.
*/
write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5e659ee08d6a..9ee4832b6f8b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -29,6 +29,7 @@
#include <linux/backing-dev.h>
#include <linux/bug.h>
#include <linux/module.h>
+#include <linux/sched/mm.h>
#include <trace/events/jbd2.h>
@@ -388,6 +389,11 @@ repeat:
rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
jbd2_journal_free_transaction(new_transaction);
+ /*
+ * Ensure that no allocations done while the transaction is open are
+ * going to recurse back to the fs layer.
+ */
+ handle->saved_alloc_context = memalloc_nofs_save();
return 0;
}
@@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
handle->h_transaction->t_tid, type,
line_no, nblocks);
+
return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);
@@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_rsv_handle)
jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
jbd2_free_handle(handle);
return err;
}
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index fc89f9436784..5c5ac5b3aec3 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -64,7 +64,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case JFS_IOC_GETFLAGS:
- jfs_get_inode_flags(jfs_inode);
flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
flags = jfs_map_ext2(flags, 0);
return put_user(flags, (int __user *) arg);
@@ -98,7 +97,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
/* Lock against other parallel changes of flags */
inode_lock(inode);
- jfs_get_inode_flags(jfs_inode);
oldflags = jfs_inode->mode2;
/*
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 6aca224a5d68..f36ef68905a7 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3148,7 +3148,6 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
else
dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns,
jfs_ip->saved_gid));
- jfs_get_inode_flags(jfs_ip);
/*
* mode2 is only needed for storing the higher order bits.
* Trust i_mode for the lower order ones
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 375dd257a34f..5e9b7bb3aabf 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -45,24 +45,6 @@ void jfs_set_inode_flags(struct inode *inode)
S_DIRSYNC | S_SYNC);
}
-void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip)
-{
- unsigned int flags = jfs_ip->vfs_inode.i_flags;
-
- jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL |
- JFS_DIRSYNC_FL | JFS_SYNC_FL);
- if (flags & S_IMMUTABLE)
- jfs_ip->mode2 |= JFS_IMMUTABLE_FL;
- if (flags & S_APPEND)
- jfs_ip->mode2 |= JFS_APPEND_FL;
- if (flags & S_NOATIME)
- jfs_ip->mode2 |= JFS_NOATIME_FL;
- if (flags & S_DIRSYNC)
- jfs_ip->mode2 |= JFS_DIRSYNC_FL;
- if (flags & S_SYNC)
- jfs_ip->mode2 |= JFS_SYNC_FL;
-}
-
/*
* NAME: ialloc()
*
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9271cfe4a149..7b0b3a40788f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -33,7 +33,6 @@ extern void jfs_truncate(struct inode *);
extern void jfs_truncate_nolock(struct inode *, loff_t);
extern void jfs_free_zero_link(struct inode *);
extern struct dentry *jfs_get_parent(struct dentry *dentry);
-extern void jfs_get_inode_flags(struct jfs_inode_info *);
extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type);
extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index c64c2574a0aa..e8aad7d87b8c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -45,6 +45,7 @@
#include "jfs_acl.h"
#include "jfs_debug.h"
#include "jfs_xattr.h"
+#include "jfs_dinode.h"
MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
@@ -181,6 +182,35 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+#ifdef CONFIG_QUOTA
+static int jfs_quota_off(struct super_block *sb, int type);
+static int jfs_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path);
+
+static void jfs_quota_off_umount(struct super_block *sb)
+{
+ int type;
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ jfs_quota_off(sb, type);
+}
+
+static const struct quotactl_ops jfs_quotactl_ops = {
+ .quota_on = jfs_quota_on,
+ .quota_off = jfs_quota_off,
+ .quota_sync = dquot_quota_sync,
+ .get_state = dquot_get_state,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
+};
+#else
+static inline void jfs_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
static void jfs_put_super(struct super_block *sb)
{
struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -188,7 +218,7 @@ static void jfs_put_super(struct super_block *sb)
jfs_info("In jfs_put_super");
- dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ jfs_quota_off_umount(sb);
rc = jfs_umount(sb);
if (rc)
@@ -536,7 +566,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = jfs_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->dq_op = &dquot_operations;
- sb->s_qcop = &dquot_quotactl_ops;
+ sb->s_qcop = &jfs_quotactl_ops;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
#endif
@@ -840,6 +870,51 @@ static struct dquot **jfs_get_dquots(struct inode *inode)
{
return JFS_IP(inode)->i_dquot;
}
+
+static int jfs_quota_on(struct super_block *sb, int type, int format_id,
+ const struct path *path)
+{
+ int err;
+ struct inode *inode;
+
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ return err;
+
+ inode = d_inode(path->dentry);
+ inode_lock(inode);
+ JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL;
+ inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+ S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+static int jfs_quota_off(struct super_block *sb, int type)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ int err;
+
+ if (!inode || !igrab(inode))
+ goto out;
+
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL);
+ inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+out_put:
+ iput(inode);
+ return err;
+out:
+ return dquot_quota_off(sb, type);
+}
#endif
static const struct super_operations jfs_super_operations = {
diff --git a/fs/mount.h b/fs/mount.h
index 2826543a131d..bf1fda6eed8f 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -59,7 +59,7 @@ struct mount {
struct mountpoint *mnt_mp; /* where is it mounted */
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
#ifdef CONFIG_FSNOTIFY
- struct hlist_head mnt_fsnotify_marks;
+ struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
int mnt_id; /* mount identifier */
diff --git a/fs/namei.c b/fs/namei.c
index d41fab78798b..9a7f8bd748d8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -340,22 +340,14 @@ int generic_permission(struct inode *inode, int mask)
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
- if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
- return 0;
if (!(mask & MAY_WRITE))
if (capable_wrt_inode_uidgid(inode,
CAP_DAC_READ_SEARCH))
return 0;
- return -EACCES;
- }
- /*
- * Read/write DACs are always overridable.
- * Executable DACs are overridable when there is
- * at least one exec bit set.
- */
- if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
return 0;
+ return -EACCES;
+ }
/*
* Searching includes executable on directories, else just read.
@@ -364,6 +356,14 @@ int generic_permission(struct inode *inode, int mask)
if (mask == MAY_READ)
if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
return 0;
+ /*
+ * Read/write DACs are always overridable.
+ * Executable DACs are overridable when there is
+ * at least one exec bit set.
+ */
+ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+ return 0;
return -EACCES;
}
@@ -2145,6 +2145,9 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
int retval = 0;
const char *s = nd->name->name;
+ if (!*s)
+ flags &= ~LOOKUP_RCU;
+
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index cc1375eff88c..b3b115bd4e1e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,9 +236,6 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
-#ifdef CONFIG_FSNOTIFY
- INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
-#endif
init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
}
return mnt;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d5606099712a..6d0f14c86099 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
sb->s_magic = NCP_SUPER_MAGIC;
sb->s_op = &ncp_sops;
sb->s_d_op = &ncp_dentry_operations;
- sb->s_bdi = &server->bdi;
server = NCP_SBP(sb);
memset(server, 0, sizeof(*server));
- error = bdi_setup_and_register(&server->bdi, "ncpfs");
+ error = super_setup_bdi(sb);
if (error)
goto out_fput;
@@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (data.info_fd != -1) {
struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
if (!info_sock)
- goto out_bdi;
+ goto out_fput;
server->info_sock = info_sock;
error = -EBADFD;
if (info_sock->type != SOCK_STREAM)
@@ -746,8 +745,6 @@ out_nls:
out_fput2:
if (server->info_sock)
sockfd_put(server->info_sock);
-out_bdi:
- bdi_destroy(&server->bdi);
out_fput:
sockfd_put(sock);
out:
@@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb)
kill_pid(server->m.wdog_pid, SIGTERM, 1);
put_pid(server->m.wdog_pid);
- bdi_destroy(&server->bdi);
kfree(server->priv.data);
kfree(server->auth.object_name);
vfree(server->rxbuf);
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 55e26fd80886..366fd63cc506 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -143,7 +143,6 @@ struct ncp_server {
size_t len;
__u8 data[128];
} unexpected_packet;
- struct backing_dev_info bdi;
};
extern void ncp_tcp_rcv_proc(struct work_struct *work);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 390ada8741bc..04d15a0045e3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
- server->backing_dev_info.name = "nfs";
- server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
@@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void)
return NULL;
}
- if (bdi_init(&server->backing_dev_info)) {
- nfs_free_iostats(server->io_stats);
- kfree(server);
- return NULL;
- }
-
ida_init(&server->openowner_id);
ida_init(&server->lockowner_id);
pnfs_init_server(server);
@@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
nfs_free_iostats(server->io_stats);
- bdi_destroy(&server->backing_dev_info);
kfree(server);
nfs_release_automount_timer();
dprintk("<-- nfs_free_server()\n");
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index aab32fc3d6a8..c1b5fed7c863 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -537,7 +537,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
if (put_dreq(dreq))
nfs_direct_complete(dreq);
- return 0;
+ return requested_bytes;
}
/**
@@ -566,7 +566,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = mapping->host;
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
- ssize_t result = -EINVAL;
+ ssize_t result = -EINVAL, requested;
size_t count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
@@ -600,14 +600,19 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
- result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
+ requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
nfs_end_io_direct(inode);
- if (!result) {
+ if (requested > 0) {
result = nfs_direct_wait(dreq);
- if (result > 0)
+ if (result > 0) {
+ requested -= result;
iocb->ki_pos += result;
+ }
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
}
out_release:
@@ -954,7 +959,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
if (put_dreq(dreq))
nfs_direct_write_complete(dreq);
- return 0;
+ return requested_bytes;
}
/**
@@ -979,7 +984,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
- ssize_t result = -EINVAL;
+ ssize_t result = -EINVAL, requested;
size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -1022,7 +1027,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
nfs_start_io_direct(inode);
- result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
@@ -1031,13 +1036,17 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
nfs_end_io_direct(inode);
- if (!result) {
+ if (requested > 0) {
result = nfs_direct_wait(dreq);
if (result > 0) {
+ requested -= result;
iocb->ki_pos = pos + result;
/* XXX: should check the generic_write_sync retval */
generic_write_sync(iocb, result);
}
+ iov_iter_revert(iter, requested);
+ } else {
+ result = requested;
}
out_release:
nfs_direct_req_release(dreq);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 54e0f9f2dd94..2f3822a4a7d5 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2315,8 +2315,6 @@ inline void nfs_initialise_sb(struct super_block *sb)
sb->s_blocksize = nfs_block_bits(server->wsize,
&sb->s_blocksize_bits);
- sb->s_bdi = &server->backing_dev_info;
-
nfs_super_set_maxbytes(sb, server->maxfilesize);
}
@@ -2522,11 +2520,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
}
#endif
-static int nfs_bdi_register(struct nfs_server *server)
-{
- return bdi_register_dev(&server->backing_dev_info, server->s_dev);
-}
-
int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
@@ -2594,11 +2587,13 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
nfs_free_server(server);
server = NULL;
} else {
- error = nfs_bdi_register(server);
+ error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev),
+ MINOR(server->s_dev));
if (error) {
mntroot = ERR_PTR(error);
goto error_splat_super;
}
+ s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
server->super = s;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index abb2c8a3be42..cc341fc7fd44 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -263,16 +263,15 @@ int nfs_congestion_kb;
static void nfs_set_page_writeback(struct page *page)
{
- struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
int ret = test_set_page_writeback(page);
WARN_ON_ONCE(ret != 0);
if (atomic_long_inc_return(&nfss->writeback) >
- NFS_CONGESTION_ON_THRESH) {
- set_bdi_congested(&nfss->backing_dev_info,
- BLK_RW_ASYNC);
- }
+ NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
static void nfs_end_page_writeback(struct nfs_page *req)
@@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
@@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 92b4b41d19d2..fb5213afc854 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
req->cmd[4] = bufflen & 0xff;
req->cmd_len = COMMAND_SIZE(INQUIRY);
- error = blk_execute_rq(rq->q, NULL, rq, 1);
- if (error) {
+ blk_execute_rq(rq->q, NULL, rq, 1);
+ if (req->result) {
pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- rq->errors);
+ req->result);
+ error = -EIO;
goto out_put_request;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index dba2ff8eaa68..452334694a5d 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
{
unsigned int len, v, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
+ struct kvec *head = rqstp->rq_arg.head;
+ struct kvec *tail = rqstp->rq_arg.tail;
p = decode_fh(p, &args->fh);
if (!p)
@@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
args->count = ntohl(*p++);
args->stable = ntohl(*p++);
len = args->len = ntohl(*p++);
+ if ((void *)p > head->iov_base + head->iov_len)
+ return 0;
/*
* The count must equal the amount of data passed.
*/
@@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Check to make sure that we got the right number of
* bytes.
*/
- hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
- dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- + rqstp->rq_arg.tail[0].iov_len - hdr;
+ hdr = (void*)p - head->iov_base;
+ dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr;
/*
* Round the length of the data which was specified up to
* the next multiple of XDR units and then compare that
@@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
len = args->len = max_blocksize;
}
rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+ rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
@@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
/* first copy and check from the first page */
old = (char*)p;
vec = &rqstp->rq_arg.head[0];
+ if ((void *)old > vec->iov_base + vec->iov_len)
+ return 0;
avail = vec->iov_len - (old - (char*)vec->iov_base);
while (len && avail && *old) {
*new++ = *old++;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cbeeda1e94a2..d86031b6ad79 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2489,7 +2489,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- if (op->opnum == OP_ILLEGAL)
+ if (op->opnum == OP_ILLEGAL || op->status == nfserr_notsupp)
return op_encode_hdr_size * sizeof(__be32);
BUG_ON(OPDESC(op)->op_rsize_bop == NULL);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 31e1f9593457..59979f0bbd4b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -747,6 +747,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr)
return nfserr;
}
+/*
+ * A write procedure can have a large argument, and a read procedure can
+ * have a large reply, but no NFSv2 or NFSv3 procedure has argument and
+ * reply that can both be larger than a page. The xdr code has taken
+ * advantage of this assumption to be a sloppy about bounds checking in
+ * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that
+ * problem, we enforce these assumptions here:
+ */
+static bool nfs_request_too_big(struct svc_rqst *rqstp,
+ struct svc_procedure *proc)
+{
+ /*
+ * The ACL code has more careful bounds-checking and is not
+ * susceptible to this problem:
+ */
+ if (rqstp->rq_prog != NFS_PROGRAM)
+ return false;
+ /*
+ * Ditto NFSv4 (which can in theory have argument and reply both
+ * more than a page):
+ */
+ if (rqstp->rq_vers >= 4)
+ return false;
+ /* The reply will be small, we're OK: */
+ if (proc->pc_xdrressize > 0 &&
+ proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
+ return false;
+
+ return rqstp->rq_arg.len > PAGE_SIZE;
+}
+
int
nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
@@ -759,6 +790,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
rqstp->rq_vers, rqstp->rq_proc);
proc = rqstp->rq_procinfo;
+ if (nfs_request_too_big(rqstp, proc)) {
+ dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+ *statp = rpc_garbage_args;
+ return 1;
+ }
/*
* Give the xdr decoder a chance to change this if it wants
* (necessary in the NFSv4.0 compound case)
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 41b468a6a90f..de07ff625777 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_writeargs *args)
{
unsigned int len, hdr, dlen;
+ struct kvec *head = rqstp->rq_arg.head;
int v;
p = decode_fh(p, &args->fh);
@@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
* Check to make sure that we got the right number of
* bytes.
*/
- hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
- dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
- - hdr;
+ hdr = (void*)p - head->iov_base;
+ if (hdr > head->iov_len)
+ return 0;
+ dlen = head->iov_len + rqstp->rq_arg.page_len - hdr;
/*
* Round the length of the data which was specified up to
@@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
return 0;
rqstp->rq_vec[0].iov_base = (void*)p;
- rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+ rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
v = 0;
while (len > rqstp->rq_vec[v].iov_len) {
len -= rqstp->rq_vec[v].iov_len;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 19d50f600e8d..9aaf6ca77569 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1004,7 +1004,7 @@ out_nfserr:
else
err = nfserrno(host_err);
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
- tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+ current_restore_flags(pflags, PF_LESS_THROTTLE);
return err;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e1872f36147f..926682981d61 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info;
+ sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 96d3420d0242..3e969ae91b60 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,5 +1,5 @@
-obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
- mark.o vfsmount_mark.o fdinfo.o
+obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o mark.o \
+ fdinfo.o
obj-y += dnotify/
obj-y += inotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5a4ec309e283..2430a0415995 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -52,7 +52,7 @@ struct dnotify_mark {
*/
static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
{
- __u32 new_mask, old_mask;
+ __u32 new_mask = 0;
struct dnotify_struct *dn;
struct dnotify_mark *dn_mark = container_of(fsn_mark,
struct dnotify_mark,
@@ -60,17 +60,13 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
assert_spin_locked(&fsn_mark->lock);
- old_mask = fsn_mark->mask;
- new_mask = 0;
for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
- fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
-
- if (old_mask == new_mask)
+ if (fsn_mark->mask == new_mask)
return;
+ fsn_mark->mask = new_mask;
- if (fsn_mark->inode)
- fsnotify_recalc_inode_mask(fsn_mark->inode);
+ fsnotify_recalc_mask(fsn_mark->connector);
}
/*
@@ -86,7 +82,8 @@ static int dnotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
struct dnotify_mark *dn_mark;
struct dnotify_struct *dn;
@@ -138,6 +135,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
static struct fsnotify_ops dnotify_fsnotify_ops = {
.handle_event = dnotify_handle_event,
+ .free_mark = dnotify_free_mark,
};
/*
@@ -160,7 +158,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
if (!S_ISDIR(inode->i_mode))
return;
- fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
if (!fsn_mark)
return;
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -308,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
- fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
+ fsnotify_init_mark(new_fsn_mark, dnotify_group);
new_fsn_mark->mask = mask;
new_dn_mark->dn = NULL;
@@ -316,13 +314,12 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
mutex_lock(&dnotify_group->mark_mutex);
/* add the new_fsn_mark or find an old one. */
- fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
if (fsn_mark) {
dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
} else {
- fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
- NULL, 0);
+ fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0);
spin_lock(&new_fsn_mark->lock);
fsn_mark = new_fsn_mark;
dn_mark = new_dn_mark;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index e5f7e47de68e..2fa99aeaa095 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -57,14 +57,26 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
static int fanotify_get_response(struct fsnotify_group *group,
- struct fanotify_perm_event_info *event)
+ struct fanotify_perm_event_info *event,
+ struct fsnotify_iter_info *iter_info)
{
int ret;
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+ /*
+ * fsnotify_prepare_user_wait() fails if we race with mark deletion.
+ * Just let the operation pass in that case.
+ */
+ if (!fsnotify_prepare_user_wait(iter_info)) {
+ event->response = FAN_ALLOW;
+ goto out;
+ }
+
wait_event(group->fanotify_data.access_waitq, event->response);
+ fsnotify_finish_user_wait(iter_info);
+out:
/* userspace responded, convert to something usable */
switch (event->response) {
case FAN_ALLOW:
@@ -174,7 +186,8 @@ static int fanotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *fanotify_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
int ret = 0;
struct fanotify_event_info *event;
@@ -215,7 +228,8 @@ static int fanotify_handle_event(struct fsnotify_group *group,
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
if (mask & FAN_ALL_PERM_EVENTS) {
- ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
+ ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
+ iter_info);
fsnotify_destroy_event(group, fsn_event);
}
#endif
@@ -248,8 +262,14 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
kmem_cache_free(fanotify_event_cachep, event);
}
+static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+ kmem_cache_free(fanotify_mark_cache, fsn_mark);
+}
+
const struct fsnotify_ops fanotify_fsnotify_ops = {
.handle_event = fanotify_handle_event,
.free_group_priv = fanotify_free_group_priv,
.free_event = fanotify_free_event,
+ .free_mark = fanotify_free_mark,
};
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 4500a74f8d38..4eb6f5efa282 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,6 +2,7 @@
#include <linux/path.h>
#include <linux/slab.h>
+extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 2b37f2785834..907a481ac781 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -41,7 +41,7 @@
extern const struct fsnotify_ops fanotify_fsnotify_ops;
-static struct kmem_cache *fanotify_mark_cache __read_mostly;
+struct kmem_cache *fanotify_mark_cache __read_mostly;
struct kmem_cache *fanotify_event_cachep __read_mostly;
struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
@@ -295,27 +295,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
}
ret = copy_event_to_user(group, kevent, buf);
+ if (unlikely(ret == -EOPENSTALE)) {
+ /*
+ * We cannot report events with stale fd so drop it.
+ * Setting ret to 0 will continue the event loop and
+ * do the right thing if there are no more events to
+ * read (i.e. return bytes read, -EAGAIN or wait).
+ */
+ ret = 0;
+ }
+
/*
* Permission events get queued to wait for response. Other
* events can be destroyed now.
*/
if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
fsnotify_destroy_event(group, kevent);
- if (ret < 0)
- break;
} else {
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- if (ret < 0) {
+ if (ret <= 0) {
FANOTIFY_PE(kevent)->response = FAN_DENY;
wake_up(&group->fanotify_data.access_waitq);
- break;
+ } else {
+ spin_lock(&group->notification_lock);
+ list_add_tail(&kevent->list,
+ &group->fanotify_data.access_list);
+ spin_unlock(&group->notification_lock);
}
- spin_lock(&group->notification_lock);
- list_add_tail(&kevent->list,
- &group->fanotify_data.access_list);
- spin_unlock(&group->notification_lock);
#endif
}
+ if (ret < 0)
+ break;
buf += ret;
count -= ret;
}
@@ -445,11 +455,6 @@ static const struct file_operations fanotify_fops = {
.llseek = noop_llseek,
};
-static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
-{
- kmem_cache_free(fanotify_mark_cache, fsn_mark);
-}
-
static int fanotify_find_path(int dfd, const char __user *filename,
struct path *path, unsigned int flags)
{
@@ -511,13 +516,12 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
tmask &= ~FAN_ONDIR;
oldmask = fsn_mark->mask;
- fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+ fsn_mark->mask = tmask;
} else {
__u32 tmask = fsn_mark->ignored_mask & ~mask;
if (flags & FAN_MARK_ONDIR)
tmask &= ~FAN_ONDIR;
-
- fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+ fsn_mark->ignored_mask = tmask;
}
*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
spin_unlock(&fsn_mark->lock);
@@ -534,7 +538,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
int destroy_mark;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+ fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks,
+ group);
if (!fsn_mark) {
mutex_unlock(&group->mark_mutex);
return -ENOENT;
@@ -542,6 +547,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
+ if (removed & real_mount(mnt)->mnt_fsnotify_mask)
+ fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks);
if (destroy_mark)
fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
@@ -549,9 +556,6 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
- if (removed & real_mount(mnt)->mnt_fsnotify_mask)
- fsnotify_recalc_vfsmount_mask(mnt);
-
return 0;
}
@@ -564,7 +568,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
int destroy_mark;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_inode_mark(group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark) {
mutex_unlock(&group->mark_mutex);
return -ENOENT;
@@ -572,16 +576,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
+ if (removed & inode->i_fsnotify_mask)
+ fsnotify_recalc_mask(inode->i_fsnotify_marks);
if (destroy_mark)
fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
if (destroy_mark)
fsnotify_free_mark(fsn_mark);
- /* matches the fsnotify_find_inode_mark() */
+ /* matches the fsnotify_find_mark() */
fsnotify_put_mark(fsn_mark);
- if (removed & inode->i_fsnotify_mask)
- fsnotify_recalc_inode_mask(inode);
return 0;
}
@@ -600,13 +604,13 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
tmask |= FAN_ONDIR;
oldmask = fsn_mark->mask;
- fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+ fsn_mark->mask = tmask;
} else {
__u32 tmask = fsn_mark->ignored_mask | mask;
if (flags & FAN_MARK_ONDIR)
tmask |= FAN_ONDIR;
- fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+ fsn_mark->ignored_mask = tmask;
if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
}
@@ -629,8 +633,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
if (!mark)
return ERR_PTR(-ENOMEM);
- fsnotify_init_mark(mark, fanotify_free_mark);
- ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+ fsnotify_init_mark(mark, group);
+ ret = fsnotify_add_mark_locked(mark, inode, mnt, 0);
if (ret) {
fsnotify_put_mark(mark);
return ERR_PTR(ret);
@@ -648,7 +652,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
__u32 added;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+ fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks,
+ group);
if (!fsn_mark) {
fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
if (IS_ERR(fsn_mark)) {
@@ -657,10 +662,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
}
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- mutex_unlock(&group->mark_mutex);
-
if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
- fsnotify_recalc_vfsmount_mask(mnt);
+ fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks);
+ mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
return 0;
@@ -686,7 +690,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
return 0;
mutex_lock(&group->mark_mutex);
- fsn_mark = fsnotify_find_inode_mark(group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark) {
fsn_mark = fanotify_add_new_mark(group, inode, NULL);
if (IS_ERR(fsn_mark)) {
@@ -695,10 +699,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
}
}
added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- mutex_unlock(&group->mark_mutex);
-
if (added & ~inode->i_fsnotify_mask)
- fsnotify_recalc_inode_mask(inode);
+ fsnotify_recalc_mask(inode->i_fsnotify_marks);
+ mutex_unlock(&group->mark_mutex);
fsnotify_put_mark(fsn_mark);
return 0;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index fd98e5100cab..dd63aa9a6f9a 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,12 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct inotify_inode_mark *inode_mark;
struct inode *inode;
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
- !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
+ if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE))
return;
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
- inode = igrab(mark->inode);
+ inode = igrab(mark->connector->inode);
if (inode) {
/*
* IN_ALL_EVENTS represents all of the mask bits
@@ -113,14 +112,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
unsigned int mflags = 0;
struct inode *inode;
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
- return;
-
if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
- if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
- inode = igrab(mark->inode);
+ if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = igrab(mark->connector->inode);
if (!inode)
return;
seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
@@ -129,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
- } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
- struct mount *mnt = real_mount(mark->mnt);
+ } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ struct mount *mnt = real_mount(mark->connector->mnt);
seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index b41515d3f081..01a9f0f007d4 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -41,6 +41,63 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
fsnotify_clear_marks_by_mount(mnt);
}
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
+ * @sb: superblock being unmounted.
+ *
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
+ */
+void fsnotify_unmount_inodes(struct super_block *sb)
+{
+ struct inode *inode, *iput_inode = NULL;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ /*
+ * We cannot __iget() an inode in state I_FREEING,
+ * I_WILL_FREE, or I_NEW which is fine because by that point
+ * the inode cannot have any associated watches.
+ */
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+
+ /*
+ * If i_count is zero, the inode cannot have any watches and
+ * doing an __iget/iput with MS_ACTIVE clear would actually
+ * evict all inodes with zero i_count from icache which is
+ * unnecessarily violent and may in fact be illegal to do.
+ */
+ if (!atomic_read(&inode->i_count)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+
+ if (iput_inode)
+ iput(iput_inode);
+
+ /* for each watch, send FS_UNMOUNT and then remove it */
+ fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+
+ fsnotify_inode_delete(inode);
+
+ iput_inode = inode;
+
+ spin_lock(&sb->s_inode_list_lock);
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+
+ if (iput_inode)
+ iput(iput_inode);
+}
+
/*
* Given an inode, first check if we care what happens to our children. Inotify
* and dnotify both tell their parents about events. If we care about any event
@@ -127,7 +184,8 @@ static int send_to_group(struct inode *to_tell,
struct fsnotify_mark *vfsmount_mark,
__u32 mask, const void *data,
int data_is, u32 cookie,
- const unsigned char *file_name)
+ const unsigned char *file_name,
+ struct fsnotify_iter_info *iter_info)
{
struct fsnotify_group *group = NULL;
__u32 inode_test_mask = 0;
@@ -178,7 +236,7 @@ static int send_to_group(struct inode *to_tell,
return group->ops->handle_event(group, to_tell, inode_mark,
vfsmount_mark, mask, data, data_is,
- file_name, cookie);
+ file_name, cookie, iter_info);
}
/*
@@ -193,8 +251,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
struct fsnotify_group *inode_group, *vfsmount_group;
+ struct fsnotify_mark_connector *inode_conn, *vfsmount_conn;
+ struct fsnotify_iter_info iter_info;
struct mount *mnt;
- int idx, ret = 0;
+ int ret = 0;
/* global tests shouldn't care about events on child only the specific event */
__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
@@ -210,8 +270,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
* SRCU because we have no references to any objects and do not
* need SRCU to keep them "alive".
*/
- if (hlist_empty(&to_tell->i_fsnotify_marks) &&
- (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+ if (!to_tell->i_fsnotify_marks &&
+ (!mnt || !mnt->mnt_fsnotify_marks))
return 0;
/*
* if this is a modify event we may need to clear the ignored masks
@@ -223,19 +283,30 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
!(mnt && test_mask & mnt->mnt_fsnotify_mask))
return 0;
- idx = srcu_read_lock(&fsnotify_mark_srcu);
+ iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
if ((mask & FS_MODIFY) ||
- (test_mask & to_tell->i_fsnotify_mask))
- inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+ (test_mask & to_tell->i_fsnotify_mask)) {
+ inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
&fsnotify_mark_srcu);
+ if (inode_conn)
+ inode_node = srcu_dereference(inode_conn->list.first,
+ &fsnotify_mark_srcu);
+ }
if (mnt && ((mask & FS_MODIFY) ||
(test_mask & mnt->mnt_fsnotify_mask))) {
- vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
- &fsnotify_mark_srcu);
- inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+ inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
&fsnotify_mark_srcu);
+ if (inode_conn)
+ inode_node = srcu_dereference(inode_conn->list.first,
+ &fsnotify_mark_srcu);
+ vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks,
+ &fsnotify_mark_srcu);
+ if (vfsmount_conn)
+ vfsmount_node = srcu_dereference(
+ vfsmount_conn->list.first,
+ &fsnotify_mark_srcu);
}
/*
@@ -272,8 +343,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
vfsmount_mark = NULL;
}
}
+
+ iter_info.inode_mark = inode_mark;
+ iter_info.vfsmount_mark = vfsmount_mark;
+
ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
- data, data_is, cookie, file_name);
+ data, data_is, cookie, file_name,
+ &iter_info);
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
goto out;
@@ -287,12 +363,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
}
ret = 0;
out:
- srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);
return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);
+extern struct kmem_cache *fsnotify_mark_connector_cachep;
+
static __init int fsnotify_init(void)
{
int ret;
@@ -303,6 +381,9 @@ static __init int fsnotify_init(void)
if (ret)
panic("initializing fsnotify_mark_srcu");
+ fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
+ SLAB_PANIC);
+
return 0;
}
core_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 0a3bc2cf192c..bf012e8ecd14 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -8,60 +8,36 @@
#include "../mount.h"
+struct fsnotify_iter_info {
+ struct fsnotify_mark *inode_mark;
+ struct fsnotify_mark *vfsmount_mark;
+ int srcu_idx;
+};
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;
-/* Calculate mask of events for a list of marks */
-extern u32 fsnotify_recalc_mask(struct hlist_head *head);
-
/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
struct fsnotify_group *b);
-extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
- __u32 mask);
-/* Add mark to a proper place in mark list */
-extern int fsnotify_add_mark_list(struct hlist_head *head,
- struct fsnotify_mark *mark,
- int allow_dups);
-/* add a mark to an inode */
-extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct inode *inode,
- int allow_dups);
-/* add a mark to a vfsmount */
-extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct vfsmount *mnt,
- int allow_dups);
-
-/* vfsmount specific destruction of a mark */
-extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
-/* inode specific destruction of a mark */
-extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Find mark belonging to given group in the list of marks */
-extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
- struct fsnotify_group *group);
-/* Destroy all marks in the given list protected by 'lock' */
-extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* Destroy all marks connected via given connector */
+extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp);
/* run the list of all marks associated with inode and destroy them */
static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
{
- fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+ fsnotify_destroy_marks(&inode->i_fsnotify_marks);
}
/* run the list of all marks associated with vfsmount and destroy them */
static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
- fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
- &mnt->mnt_root->d_lock);
+ fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
-/* prepare for freeing all marks associated with given group */
-extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
-/*
- * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
- */
-extern void fsnotify_mark_destroy_list(void);
+/* Wait until all marks queued for destruction are destroyed */
+extern void fsnotify_wait_marks_destroyed(void);
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fbe3cbebec16..32357534de18 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -66,14 +66,23 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
*/
fsnotify_group_stop_queueing(group);
- /* clear all inode marks for this group, attach them to destroy_list */
- fsnotify_detach_group_marks(group);
+ /* Clear all marks for this group and queue them for destruction */
+ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES);
/*
- * Wait for fsnotify_mark_srcu period to end and free all marks in
- * destroy_list
+ * Some marks can still be pinned when waiting for response from
+ * userspace. Wait for those now. fsnotify_prepare_user_wait() will
+ * not succeed now so this wait is race-free.
*/
- fsnotify_mark_destroy_list();
+ wait_event(group->notification_waitq, !atomic_read(&group->user_waits));
+
+ /*
+ * Wait until all marks get really destroyed. We could actually destroy
+ * them ourselves instead of waiting for worker to do it, however that
+ * would be racy as worker can already be processing some marks before
+ * we even entered fsnotify_destroy_group().
+ */
+ fsnotify_wait_marks_destroyed();
/*
* Since we have waited for fsnotify_mark_srcu in
@@ -124,6 +133,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
/* set to 0 when there a no external references to this group */
atomic_set(&group->refcnt, 1);
atomic_set(&group->num_marks, 0);
+ atomic_set(&group->user_waits, 0);
spin_lock_init(&group->notification_lock);
INIT_LIST_HEAD(&group->notification_list);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
deleted file mode 100644
index a3645249f7ec..000000000000
--- a/fs/notify/inode_mark.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-#include "../internal.h"
-
-/*
- * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this inode.
- */
-void fsnotify_recalc_inode_mask(struct inode *inode)
-{
- spin_lock(&inode->i_lock);
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
- spin_unlock(&inode->i_lock);
-
- __fsnotify_update_child_dentry_flags(inode);
-}
-
-void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
-{
- struct inode *inode = mark->inode;
-
- BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&inode->i_lock);
-
- hlist_del_init_rcu(&mark->obj_list);
- mark->inode = NULL;
-
- /*
- * this mark is now off the inode->i_fsnotify_marks list and we
- * hold the inode->i_lock, so this is the perfect time to update the
- * inode->i_fsnotify_mask
- */
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
- spin_unlock(&inode->i_lock);
-}
-
-/*
- * Given a group clear all of the inode marks associated with that group.
- */
-void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
-{
- fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
-}
-
-/*
- * given a group and inode, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
-struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
- struct inode *inode)
-{
- struct fsnotify_mark *mark;
-
- spin_lock(&inode->i_lock);
- mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
- spin_unlock(&inode->i_lock);
-
- return mark;
-}
-
-/*
- * If we are setting a mark mask on an inode mark we should pin the inode
- * in memory.
- */
-void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
- __u32 mask)
-{
- struct inode *inode;
-
- assert_spin_locked(&mark->lock);
-
- if (mask &&
- mark->inode &&
- !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
- mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
- inode = igrab(mark->inode);
- /*
- * we shouldn't be able to get here if the inode wasn't
- * already safely held in memory. But bug in case it
- * ever is wrong.
- */
- BUG_ON(!inode);
- }
-}
-
-/*
- * Attach an initialized mark to a given inode.
- * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which group and for which inodes. These
- * marks are ordered according to priority, highest number first, and then by
- * the group's location in memory.
- */
-int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct inode *inode,
- int allow_dups)
-{
- int ret;
-
- mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
-
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&inode->i_lock);
- mark->inode = inode;
- ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
- allow_dups);
- inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
- spin_unlock(&inode->i_lock);
-
- return ret;
-}
-
-/**
- * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-void fsnotify_unmount_inodes(struct super_block *sb)
-{
- struct inode *inode, *iput_inode = NULL;
-
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- /*
- * We cannot __iget() an inode in state I_FREEING,
- * I_WILL_FREE, or I_NEW which is fine because by that point
- * the inode cannot have any associated watches.
- */
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- /*
- * If i_count is zero, the inode cannot have any watches and
- * doing an __iget/iput with MS_ACTIVE clear would actually
- * evict all inodes with zero i_count from icache which is
- * unnecessarily violent and may in fact be illegal to do.
- */
- if (!atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- __iget(inode);
- spin_unlock(&inode->i_lock);
- spin_unlock(&sb->s_inode_list_lock);
-
- if (iput_inode)
- iput(iput_inode);
-
- /* for each watch, send FS_UNMOUNT and then remove it */
- fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
-
- fsnotify_inode_delete(inode);
-
- iput_inode = inode;
-
- spin_lock(&sb->s_inode_list_lock);
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- if (iput_inode)
- iput(iput_inode);
-}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7c461fd49c4c..9ff67b61da8a 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -27,9 +27,11 @@ extern int inotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie);
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info);
extern const struct fsnotify_ops inotify_fsnotify_ops;
+extern struct kmem_cache *inotify_inode_mark_cachep;
#ifdef CONFIG_INOTIFY_USER
static inline void dec_inotify_instances(struct ucounts *ucounts)
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1aeb837ae414..8b73332735ba 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -68,7 +68,8 @@ int inotify_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie)
+ const unsigned char *file_name, u32 cookie,
+ struct fsnotify_iter_info *iter_info)
{
struct inotify_inode_mark *i_mark;
struct inotify_event_info *event;
@@ -156,8 +157,8 @@ static int idr_callback(int id, void *p, void *data)
* BUG() that was here.
*/
if (fsn_mark)
- printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
- fsn_mark->group, fsn_mark->inode, i_mark->wd);
+ printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n",
+ fsn_mark->group, i_mark->wd);
return 0;
}
@@ -175,9 +176,20 @@ static void inotify_free_event(struct fsnotify_event *fsn_event)
kfree(INOTIFY_E(fsn_event));
}
+/* ding dong the mark is dead */
+static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+ struct inotify_inode_mark *i_mark;
+
+ i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+
+ kmem_cache_free(inotify_inode_mark_cachep, i_mark);
+}
+
const struct fsnotify_ops inotify_fsnotify_ops = {
.handle_event = inotify_handle_event,
.free_group_priv = inotify_free_group_priv,
.free_event = inotify_free_event,
.freeing_mark = inotify_freeing_mark,
+ .free_mark = inotify_free_mark,
};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 498d609b26c7..7cc7d3fb1862 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,7 +47,7 @@
/* configurable via /proc/sys/fs/inotify/ */
static int inotify_max_queued_events __read_mostly;
-static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
#ifdef CONFIG_SYSCTL
@@ -395,21 +395,6 @@ static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
return i_mark;
}
-static void do_inotify_remove_from_idr(struct fsnotify_group *group,
- struct inotify_inode_mark *i_mark)
-{
- struct idr *idr = &group->inotify_data.idr;
- spinlock_t *idr_lock = &group->inotify_data.idr_lock;
- int wd = i_mark->wd;
-
- assert_spin_locked(idr_lock);
-
- idr_remove(idr, wd);
-
- /* removed from the idr, drop that ref */
- fsnotify_put_mark(&i_mark->fsn_mark);
-}
-
/*
* Remove the mark from the idr (if present) and drop the reference
* on the mark because it was in the idr.
@@ -417,6 +402,7 @@ static void do_inotify_remove_from_idr(struct fsnotify_group *group,
static void inotify_remove_from_idr(struct fsnotify_group *group,
struct inotify_inode_mark *i_mark)
{
+ struct idr *idr = &group->inotify_data.idr;
spinlock_t *idr_lock = &group->inotify_data.idr_lock;
struct inotify_inode_mark *found_i_mark = NULL;
int wd;
@@ -429,18 +415,16 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
* if it wasn't....
*/
if (wd == -1) {
- WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
- " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+ WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+ __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
goto out;
}
/* Lets look in the idr to see if we find it */
found_i_mark = inotify_idr_find_locked(group, wd);
if (unlikely(!found_i_mark)) {
- WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
- " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+ WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+ __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
goto out;
}
@@ -451,35 +435,33 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
*/
if (unlikely(found_i_mark != i_mark)) {
WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
- "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
- "found_i_mark->group=%p found_i_mark->inode=%p\n",
- __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
- i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd,
- found_i_mark->fsn_mark.group,
- found_i_mark->fsn_mark.inode);
+ "found_i_mark=%p found_i_mark->wd=%d "
+ "found_i_mark->group=%p\n", __func__, i_mark,
+ i_mark->wd, i_mark->fsn_mark.group, found_i_mark,
+ found_i_mark->wd, found_i_mark->fsn_mark.group);
goto out;
}
/*
* One ref for being in the idr
- * one ref held by the caller trying to kill us
* one ref grabbed by inotify_idr_find
*/
- if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
- printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
- " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
- i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+ if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) {
+ printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+ __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
/* we can't really recover with bad ref cnting.. */
BUG();
}
- do_inotify_remove_from_idr(group, i_mark);
+ idr_remove(idr, wd);
+ /* Removed from the idr, drop that ref. */
+ fsnotify_put_mark(&i_mark->fsn_mark);
out:
+ i_mark->wd = -1;
+ spin_unlock(idr_lock);
/* match the ref taken by inotify_idr_find_locked() */
if (found_i_mark)
fsnotify_put_mark(&found_i_mark->fsn_mark);
- i_mark->wd = -1;
- spin_unlock(idr_lock);
}
/*
@@ -492,7 +474,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
/* Queue ignore event for the watch */
inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
- NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+ NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL);
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
/* remove this mark from the idr */
@@ -501,16 +483,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
dec_inotify_watches(group->inotify_data.ucounts);
}
-/* ding dong the mark is dead */
-static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
-{
- struct inotify_inode_mark *i_mark;
-
- i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
-
- kmem_cache_free(inotify_inode_mark_cachep, i_mark);
-}
-
static int inotify_update_existing_watch(struct fsnotify_group *group,
struct inode *inode,
u32 arg)
@@ -524,21 +496,19 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
mask = inotify_arg_to_mask(arg);
- fsn_mark = fsnotify_find_inode_mark(group, inode);
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
if (!fsn_mark)
return -ENOENT;
i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
spin_lock(&fsn_mark->lock);
-
old_mask = fsn_mark->mask;
if (add)
- fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
+ fsn_mark->mask |= mask;
else
- fsnotify_set_mark_mask_locked(fsn_mark, mask);
+ fsn_mark->mask = mask;
new_mask = fsn_mark->mask;
-
spin_unlock(&fsn_mark->lock);
if (old_mask != new_mask) {
@@ -549,7 +519,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
/* update the inode with this new fsn_mark */
if (dropped || do_inode)
- fsnotify_recalc_inode_mask(inode);
+ fsnotify_recalc_mask(inode->i_fsnotify_marks);
}
@@ -578,7 +548,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
if (unlikely(!tmp_i_mark))
return -ENOMEM;
- fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
+ fsnotify_init_mark(&tmp_i_mark->fsn_mark, group);
tmp_i_mark->fsn_mark.mask = mask;
tmp_i_mark->wd = -1;
@@ -594,8 +564,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
}
/* we are on the idr, now get on the inode */
- ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
- NULL, 0);
+ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0);
if (ret) {
/* we failed to get on the inode, get off the idr */
inotify_remove_from_idr(group, tmp_i_mark);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 6043306e8e21..9991f8826734 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -33,7 +33,7 @@
*
* group->mark_mutex
* mark->lock
- * inode->i_lock
+ * mark->connector->lock
*
* group->mark_mutex protects the marks_list anchored inside a given group and
* each mark is hooked via the g_list. It also protects the groups private
@@ -44,14 +44,22 @@
* is assigned to as well as the access to a reference of the inode/vfsmount
* that is being watched by the mark.
*
- * inode->i_lock protects the i_fsnotify_marks list anchored inside a
- * given inode and each mark is hooked via the i_list. (and sorta the
- * free_i_list)
+ * mark->connector->lock protects the list of marks anchored inside an
+ * inode / vfsmount and each mark is hooked via the i_list.
*
+ * A list of notification marks relating to inode / mnt is contained in
+ * fsnotify_mark_connector. That structure is alive as long as there are any
+ * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
+ * detached from fsnotify_mark_connector when last reference to the mark is
+ * dropped. Thus having mark reference is enough to protect mark->connector
+ * pointer and to make sure fsnotify_mark_connector cannot disappear. Also
+ * because we remove mark from g_list before dropping mark reference associated
+ * with that, any mark found through g_list is guaranteed to have
+ * mark->connector set until we drop group->mark_mutex.
*
* LIFETIME:
* Inode marks survive between when they are added to an inode and when their
- * refcnt==0.
+ * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
*
* The inode mark can be cleared for a number of different reasons including:
* - The inode is unlinked for the last time. (fsnotify_inode_remove)
@@ -61,17 +69,6 @@
* - The fsnotify_group associated with the mark is going away and all such marks
* need to be cleaned up. (fsnotify_clear_marks_by_group)
*
- * Worst case we are given an inode and need to clean up all the marks on that
- * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
- * mark on the list we take a reference (so the mark can't disappear under us).
- * We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list; we walk i_free_list
- * and before we destroy the mark we make sure that we dont race with a
- * concurrent destroy_group by getting a ref to the marks group and taking the
- * groups mutex.
-
- * Very similarly for freeing by group, except we use free_g_list.
- *
* This has the very interesting property of being able to run concurrently with
* any (or all) other directions.
*/
@@ -94,94 +91,281 @@
#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
struct srcu_struct fsnotify_mark_srcu;
+struct kmem_cache *fsnotify_mark_connector_cachep;
+
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
+static struct fsnotify_mark_connector *connector_destroy_list;
static void fsnotify_mark_destroy_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
+static void fsnotify_connector_destroy_workfn(struct work_struct *work);
+static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);
+
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
+ WARN_ON_ONCE(!atomic_read(&mark->refcnt));
atomic_inc(&mark->refcnt);
}
-void fsnotify_put_mark(struct fsnotify_mark *mark)
+/*
+ * Get mark reference when we found the mark via lockless traversal of object
+ * list. Mark can be already removed from the list by now and on its way to be
+ * destroyed once SRCU period ends.
+ */
+static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
- if (atomic_dec_and_test(&mark->refcnt)) {
- if (mark->group)
- fsnotify_put_group(mark->group);
- mark->free_mark(mark);
- }
+ return atomic_inc_not_zero(&mark->refcnt);
}
-/* Calculate mask of events for a list of marks */
-u32 fsnotify_recalc_mask(struct hlist_head *head)
+static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
struct fsnotify_mark *mark;
- hlist_for_each_entry(mark, head, obj_list)
- new_mask |= mark->mask;
- return new_mask;
+ assert_spin_locked(&conn->lock);
+ hlist_for_each_entry(mark, &conn->list, obj_list) {
+ if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
+ new_mask |= mark->mask;
+ }
+ if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
+ conn->inode->i_fsnotify_mask = new_mask;
+ else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT)
+ real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask;
}
/*
- * Remove mark from inode / vfsmount list, group list, drop inode reference
- * if we got one.
- *
- * Must be called with group->mark_mutex held.
+ * Calculate mask of events for a list of marks. The caller must make sure
+ * connector and connector->inode cannot disappear under us. Callers achieve
+ * this by holding a mark->lock or mark->group->mark_mutex for a mark on this
+ * list.
*/
-void fsnotify_detach_mark(struct fsnotify_mark *mark)
+void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+{
+ if (!conn)
+ return;
+
+ spin_lock(&conn->lock);
+ __fsnotify_recalc_mask(conn);
+ spin_unlock(&conn->lock);
+ if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
+ __fsnotify_update_child_dentry_flags(conn->inode);
+}
+
+/* Free all connectors queued for freeing once SRCU period ends */
+static void fsnotify_connector_destroy_workfn(struct work_struct *work)
+{
+ struct fsnotify_mark_connector *conn, *free;
+
+ spin_lock(&destroy_lock);
+ conn = connector_destroy_list;
+ connector_destroy_list = NULL;
+ spin_unlock(&destroy_lock);
+
+ synchronize_srcu(&fsnotify_mark_srcu);
+ while (conn) {
+ free = conn;
+ conn = conn->destroy_next;
+ kmem_cache_free(fsnotify_mark_connector_cachep, free);
+ }
+}
+
+static struct inode *fsnotify_detach_connector_from_object(
+ struct fsnotify_mark_connector *conn)
{
struct inode *inode = NULL;
+
+ if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = conn->inode;
+ rcu_assign_pointer(inode->i_fsnotify_marks, NULL);
+ inode->i_fsnotify_mask = 0;
+ conn->inode = NULL;
+ conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE;
+ } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks,
+ NULL);
+ real_mount(conn->mnt)->mnt_fsnotify_mask = 0;
+ conn->mnt = NULL;
+ conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+ }
+
+ return inode;
+}
+
+static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
+{
struct fsnotify_group *group = mark->group;
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
+ if (WARN_ON_ONCE(!group))
+ return;
+ group->ops->free_mark(mark);
+ fsnotify_put_group(group);
+}
- spin_lock(&mark->lock);
+void fsnotify_put_mark(struct fsnotify_mark *mark)
+{
+ struct fsnotify_mark_connector *conn;
+ struct inode *inode = NULL;
+ bool free_conn = false;
- /* something else already called this function on this mark */
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
- spin_unlock(&mark->lock);
+ /* Catch marks that were actually never attached to object */
+ if (!mark->connector) {
+ if (atomic_dec_and_test(&mark->refcnt))
+ fsnotify_final_mark_destroy(mark);
return;
}
- mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
+ /*
+ * We have to be careful so that traversals of obj_list under lock can
+ * safely grab mark reference.
+ */
+ if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+ return;
- if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
- inode = mark->inode;
- fsnotify_destroy_inode_mark(mark);
- } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
- fsnotify_destroy_vfsmount_mark(mark);
- else
- BUG();
+ conn = mark->connector;
+ hlist_del_init_rcu(&mark->obj_list);
+ if (hlist_empty(&conn->list)) {
+ inode = fsnotify_detach_connector_from_object(conn);
+ free_conn = true;
+ } else {
+ __fsnotify_recalc_mask(conn);
+ }
+ mark->connector = NULL;
+ spin_unlock(&conn->lock);
+
+ iput(inode);
+
+ if (free_conn) {
+ spin_lock(&destroy_lock);
+ conn->destroy_next = connector_destroy_list;
+ connector_destroy_list = conn;
+ spin_unlock(&destroy_lock);
+ queue_work(system_unbound_wq, &connector_reaper_work);
+ }
/*
* Note that we didn't update flags telling whether inode cares about
* what's happening with children. We update these flags from
* __fsnotify_parent() lazily when next event happens on one of our
* children.
*/
+ spin_lock(&destroy_lock);
+ list_add(&mark->g_list, &destroy_list);
+ spin_unlock(&destroy_lock);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
+}
- list_del_init(&mark->g_list);
+bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
+{
+ struct fsnotify_group *group;
- spin_unlock(&mark->lock);
+ if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark))
+ return false;
+
+ if (iter_info->inode_mark)
+ group = iter_info->inode_mark->group;
+ else
+ group = iter_info->vfsmount_mark->group;
+
+ /*
+ * Since acquisition of mark reference is an atomic op as well, we can
+ * be sure this inc is seen before any effect of refcount increment.
+ */
+ atomic_inc(&group->user_waits);
+
+ if (iter_info->inode_mark) {
+ /* This can fail if mark is being removed */
+ if (!fsnotify_get_mark_safe(iter_info->inode_mark))
+ goto out_wait;
+ }
+ if (iter_info->vfsmount_mark) {
+ if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark))
+ goto out_inode;
+ }
- if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
- iput(inode);
+ /*
+ * Now that both marks are pinned by refcount in the inode / vfsmount
+ * lists, we can drop SRCU lock, and safely resume the list iteration
+ * once userspace returns.
+ */
+ srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);
+
+ return true;
+out_inode:
+ if (iter_info->inode_mark)
+ fsnotify_put_mark(iter_info->inode_mark);
+out_wait:
+ if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
+ wake_up(&group->notification_waitq);
+ return false;
+}
+
+void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
+{
+ struct fsnotify_group *group = NULL;
+
+ iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
+ if (iter_info->inode_mark) {
+ group = iter_info->inode_mark->group;
+ fsnotify_put_mark(iter_info->inode_mark);
+ }
+ if (iter_info->vfsmount_mark) {
+ group = iter_info->vfsmount_mark->group;
+ fsnotify_put_mark(iter_info->vfsmount_mark);
+ }
+ /*
+ * We abuse notification_waitq on group shutdown for waiting for all
+ * marks pinned when waiting for userspace.
+ */
+ if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
+ wake_up(&group->notification_waitq);
+}
+
+/*
+ * Mark mark as detached, remove it from group list. Mark still stays in object
+ * list until its last reference is dropped. Note that we rely on mark being
+ * removed from group list before corresponding reference to it is dropped. In
+ * particular we rely on mark->connector being valid while we hold
+ * group->mark_mutex if we found the mark through g_list.
+ *
+ * Must be called with group->mark_mutex held. The caller must either hold
+ * reference to the mark or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
+{
+ struct fsnotify_group *group = mark->group;
+
+ WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
+ WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
+ atomic_read(&mark->refcnt) < 1 +
+ !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
+
+ spin_lock(&mark->lock);
+ /* something else already called this function on this mark */
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
+ spin_unlock(&mark->lock);
+ return;
+ }
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
+ list_del_init(&mark->g_list);
+ spin_unlock(&mark->lock);
atomic_dec(&group->num_marks);
+
+ /* Drop mark reference acquired in fsnotify_add_mark_locked() */
+ fsnotify_put_mark(mark);
}
/*
- * Prepare mark for freeing and add it to the list of marks prepared for
- * freeing. The actual freeing must happen after SRCU period ends and the
- * caller is responsible for this.
+ * Free fsnotify mark. The mark is actually only marked as being freed. The
+ * freeing is actually happening only once last reference to the mark is
+ * dropped from a workqueue which first waits for srcu period end.
*
- * The function returns true if the mark was added to the list of marks for
- * freeing. The function returns false if someone else has already called
- * __fsnotify_free_mark() for the mark.
+ * Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
*/
-static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
+void fsnotify_free_mark(struct fsnotify_mark *mark)
{
struct fsnotify_group *group = mark->group;
@@ -189,7 +373,7 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
/* something else already called this function on this mark */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
spin_unlock(&mark->lock);
- return false;
+ return;
}
mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
spin_unlock(&mark->lock);
@@ -201,25 +385,6 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
-
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
-
- return true;
-}
-
-/*
- * Free fsnotify mark. The freeing is actually happening from a workqueue which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
- */
-void fsnotify_free_mark(struct fsnotify_mark *mark)
-{
- if (__fsnotify_free_mark(mark)) {
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
- }
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -231,54 +396,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
fsnotify_free_mark(mark);
}
-void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
-{
- struct fsnotify_mark *mark;
-
- while (1) {
- /*
- * We have to be careful since we can race with e.g.
- * fsnotify_clear_marks_by_group() and once we drop 'lock',
- * mark can get removed from the obj_list and destroyed. But
- * we are holding mark reference so mark cannot be freed and
- * calling fsnotify_destroy_mark() more than once is fine.
- */
- spin_lock(lock);
- if (hlist_empty(head)) {
- spin_unlock(lock);
- break;
- }
- mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
- /*
- * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
- * since inode / mount is going away anyway. So just remove
- * mark from the list.
- */
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- spin_unlock(lock);
- fsnotify_destroy_mark(mark, mark->group);
- fsnotify_put_mark(mark);
- }
-}
-
-void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
-{
- assert_spin_locked(&mark->lock);
-
- mark->mask = mask;
-
- if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
- fsnotify_set_inode_mark_mask_locked(mark, mask);
-}
-
-void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
-{
- assert_spin_locked(&mark->lock);
-
- mark->ignored_mask = mask;
-}
-
/*
* Sorting function for lists of fsnotify marks.
*
@@ -315,37 +432,133 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
return -1;
}
-/* Add mark into proper place in given list of marks */
-int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
- int allow_dups)
+static int fsnotify_attach_connector_to_object(
+ struct fsnotify_mark_connector __rcu **connp,
+ struct inode *inode,
+ struct vfsmount *mnt)
+{
+ struct fsnotify_mark_connector *conn;
+
+ conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
+ if (!conn)
+ return -ENOMEM;
+ spin_lock_init(&conn->lock);
+ INIT_HLIST_HEAD(&conn->list);
+ if (inode) {
+ conn->flags = FSNOTIFY_OBJ_TYPE_INODE;
+ conn->inode = igrab(inode);
+ } else {
+ conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+ conn->mnt = mnt;
+ }
+ /*
+ * cmpxchg() provides the barrier so that readers of *connp can see
+ * only initialized structure
+ */
+ if (cmpxchg(connp, NULL, conn)) {
+ /* Someone else created list structure for us */
+ if (inode)
+ iput(inode);
+ kmem_cache_free(fsnotify_mark_connector_cachep, conn);
+ }
+
+ return 0;
+}
+
+/*
+ * Get mark connector, make sure it is alive and return with its lock held.
+ * This is for users that get connector pointer from inode or mount. Users that
+ * hold reference to a mark on the list may directly lock connector->lock as
+ * they are sure list cannot go away under them.
+ */
+static struct fsnotify_mark_connector *fsnotify_grab_connector(
+ struct fsnotify_mark_connector __rcu **connp)
+{
+ struct fsnotify_mark_connector *conn;
+ int idx;
+
+ idx = srcu_read_lock(&fsnotify_mark_srcu);
+ conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
+ if (!conn)
+ goto out;
+ spin_lock(&conn->lock);
+ if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE |
+ FSNOTIFY_OBJ_TYPE_VFSMOUNT))) {
+ spin_unlock(&conn->lock);
+ srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ return NULL;
+ }
+out:
+ srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ return conn;
+}
+
+/*
+ * Add mark into proper place in given list of marks. These marks may be used
+ * for the fsnotify backend to determine which event types should be delivered
+ * to which group and for which inodes. These marks are ordered according to
+ * priority, highest number first, and then by the group's location in memory.
+ */
+static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
+ struct inode *inode, struct vfsmount *mnt,
+ int allow_dups)
{
struct fsnotify_mark *lmark, *last = NULL;
+ struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark_connector __rcu **connp;
int cmp;
+ int err = 0;
+
+ if (WARN_ON(!inode && !mnt))
+ return -EINVAL;
+ if (inode)
+ connp = &inode->i_fsnotify_marks;
+ else
+ connp = &real_mount(mnt)->mnt_fsnotify_marks;
+restart:
+ spin_lock(&mark->lock);
+ conn = fsnotify_grab_connector(connp);
+ if (!conn) {
+ spin_unlock(&mark->lock);
+ err = fsnotify_attach_connector_to_object(connp, inode, mnt);
+ if (err)
+ return err;
+ goto restart;
+ }
/* is mark the first mark? */
- if (hlist_empty(head)) {
- hlist_add_head_rcu(&mark->obj_list, head);
- return 0;
+ if (hlist_empty(&conn->list)) {
+ hlist_add_head_rcu(&mark->obj_list, &conn->list);
+ goto added;
}
/* should mark be in the middle of the current list? */
- hlist_for_each_entry(lmark, head, obj_list) {
+ hlist_for_each_entry(lmark, &conn->list, obj_list) {
last = lmark;
- if ((lmark->group == mark->group) && !allow_dups)
- return -EEXIST;
+ if ((lmark->group == mark->group) &&
+ (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
+ !allow_dups) {
+ err = -EEXIST;
+ goto out_err;
+ }
cmp = fsnotify_compare_groups(lmark->group, mark->group);
if (cmp >= 0) {
hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
- return 0;
+ goto added;
}
}
BUG_ON(last == NULL);
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
- return 0;
+added:
+ mark->connector = conn;
+out_err:
+ spin_unlock(&conn->lock);
+ spin_unlock(&mark->lock);
+ return err;
}
/*
@@ -353,10 +566,10 @@ int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group.
*/
-int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct inode *inode,
+int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode,
struct vfsmount *mnt, int allow_dups)
{
+ struct fsnotify_group *group = mark->group;
int ret = 0;
BUG_ON(inode && mnt);
@@ -367,61 +580,42 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
* LOCKING ORDER!!!!
* group->mark_mutex
* mark->lock
- * inode->i_lock
+ * mark->connector->lock
*/
spin_lock(&mark->lock);
mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
- fsnotify_get_group(group);
- mark->group = group;
list_add(&mark->g_list, &group->marks_list);
atomic_inc(&group->num_marks);
- fsnotify_get_mark(mark); /* for i_list and g_list */
-
- if (inode) {
- ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
- if (ret)
- goto err;
- } else if (mnt) {
- ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
- if (ret)
- goto err;
- } else {
- BUG();
- }
-
- /* this will pin the object if appropriate */
- fsnotify_set_mark_mask_locked(mark, mark->mask);
+ fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- if (inode)
- __fsnotify_update_child_dentry_flags(inode);
+ ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups);
+ if (ret)
+ goto err;
+
+ if (mark->mask)
+ fsnotify_recalc_mask(mark->connector);
return ret;
err:
- mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
+ FSNOTIFY_MARK_FLAG_ATTACHED);
list_del_init(&mark->g_list);
- fsnotify_put_group(group);
- mark->group = NULL;
atomic_dec(&group->num_marks);
- spin_unlock(&mark->lock);
-
- spin_lock(&destroy_lock);
- list_add(&mark->g_list, &destroy_list);
- spin_unlock(&destroy_lock);
- queue_delayed_work(system_unbound_wq, &reaper_work,
- FSNOTIFY_REAPER_DELAY);
-
+ fsnotify_put_mark(mark);
return ret;
}
-int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
- struct inode *inode, struct vfsmount *mnt, int allow_dups)
+int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode,
+ struct vfsmount *mnt, int allow_dups)
{
int ret;
+ struct fsnotify_group *group = mark->group;
+
mutex_lock(&group->mark_mutex);
- ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
+ ret = fsnotify_add_mark_locked(mark, inode, mnt, allow_dups);
mutex_unlock(&group->mark_mutex);
return ret;
}
@@ -430,29 +624,42 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
* Given a list of marks, find the mark associated with given group. If found
* take a reference to that mark and return it, else return NULL.
*/
-struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
- struct fsnotify_group *group)
+struct fsnotify_mark *fsnotify_find_mark(
+ struct fsnotify_mark_connector __rcu **connp,
+ struct fsnotify_group *group)
{
+ struct fsnotify_mark_connector *conn;
struct fsnotify_mark *mark;
- hlist_for_each_entry(mark, head, obj_list) {
- if (mark->group == group) {
+ conn = fsnotify_grab_connector(connp);
+ if (!conn)
+ return NULL;
+
+ hlist_for_each_entry(mark, &conn->list, obj_list) {
+ if (mark->group == group &&
+ (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
fsnotify_get_mark(mark);
+ spin_unlock(&conn->lock);
return mark;
}
}
+ spin_unlock(&conn->lock);
return NULL;
}
-/*
- * clear any marks in a group in which mark->flags & flags is true
- */
-void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
- unsigned int flags)
+/* Clear any marks in a group with given type */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
+ unsigned int type)
{
struct fsnotify_mark *lmark, *mark;
LIST_HEAD(to_free);
+ struct list_head *head = &to_free;
+ /* Skip selection step if we want to clear all marks. */
+ if (type == FSNOTIFY_OBJ_ALL_TYPES) {
+ head = &group->marks_list;
+ goto clear;
+ }
/*
* We have to be really careful here. Anytime we drop mark_mutex, e.g.
* fsnotify_clear_marks_by_inode() can come and free marks. Even in our
@@ -464,18 +671,19 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
*/
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
- if (mark->flags & flags)
+ if (mark->connector->flags & type)
list_move(&mark->g_list, &to_free);
}
mutex_unlock(&group->mark_mutex);
+clear:
while (1) {
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
- if (list_empty(&to_free)) {
+ if (list_empty(head)) {
mutex_unlock(&group->mark_mutex);
break;
}
- mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
+ mark = list_first_entry(head, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
@@ -484,49 +692,62 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
}
-/*
- * Given a group, prepare for freeing all the marks associated with that group.
- * The marks are attached to the list of marks prepared for destruction, the
- * caller is responsible for freeing marks in that list after SRCU period has
- * ended.
- */
-void fsnotify_detach_group_marks(struct fsnotify_group *group)
+/* Destroy all marks attached to inode / vfsmount */
+void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp)
{
- struct fsnotify_mark *mark;
+ struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark *mark, *old_mark = NULL;
+ struct inode *inode;
- while (1) {
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
- if (list_empty(&group->marks_list)) {
- mutex_unlock(&group->mark_mutex);
- break;
- }
- mark = list_first_entry(&group->marks_list,
- struct fsnotify_mark, g_list);
+ conn = fsnotify_grab_connector(connp);
+ if (!conn)
+ return;
+ /*
+ * We have to be careful since we can race with e.g.
+ * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
+ * list can get modified. However we are holding mark reference and
+ * thus our mark cannot be removed from obj_list so we can continue
+ * iteration after regaining conn->lock.
+ */
+ hlist_for_each_entry(mark, &conn->list, obj_list) {
fsnotify_get_mark(mark);
- fsnotify_detach_mark(mark);
- mutex_unlock(&group->mark_mutex);
- __fsnotify_free_mark(mark);
- fsnotify_put_mark(mark);
+ spin_unlock(&conn->lock);
+ if (old_mark)
+ fsnotify_put_mark(old_mark);
+ old_mark = mark;
+ fsnotify_destroy_mark(mark, mark->group);
+ spin_lock(&conn->lock);
}
+ /*
+ * Detach list from object now so that we don't pin inode until all
+ * mark references get dropped. It would lead to strange results such
+ * as delaying inode deletion or blocking unmount.
+ */
+ inode = fsnotify_detach_connector_from_object(conn);
+ spin_unlock(&conn->lock);
+ if (old_mark)
+ fsnotify_put_mark(old_mark);
+ iput(inode);
}
/*
* Nothing fancy, just initialize lists and locks and counters.
*/
void fsnotify_init_mark(struct fsnotify_mark *mark,
- void (*free_mark)(struct fsnotify_mark *mark))
+ struct fsnotify_group *group)
{
memset(mark, 0, sizeof(*mark));
spin_lock_init(&mark->lock);
atomic_set(&mark->refcnt, 1);
- mark->free_mark = free_mark;
+ fsnotify_get_group(group);
+ mark->group = group;
}
/*
* Destroy all marks in destroy_list, waits for SRCU period to finish before
* actually freeing marks.
*/
-void fsnotify_mark_destroy_list(void)
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
@@ -540,11 +761,12 @@ void fsnotify_mark_destroy_list(void)
list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
list_del_init(&mark->g_list);
- fsnotify_put_mark(mark);
+ fsnotify_final_mark_destroy(mark);
}
}
-static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+/* Wait for all marks queued for destruction to be actually destroyed */
+void fsnotify_wait_marks_destroyed(void)
{
- fsnotify_mark_destroy_list();
+ flush_delayed_work(&reaper_work);
}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
deleted file mode 100644
index a8fcab68faef..000000000000
--- a/fs/notify/vfsmount_mark.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
-{
- fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
-}
-
-/*
- * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this mount point
- */
-void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
-{
- struct mount *m = real_mount(mnt);
-
- spin_lock(&mnt->mnt_root->d_lock);
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
- spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
-{
- struct vfsmount *mnt = mark->mnt;
- struct mount *m = real_mount(mnt);
-
- BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&mnt->mnt_root->d_lock);
-
- hlist_del_init_rcu(&mark->obj_list);
- mark->mnt = NULL;
-
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
- spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-/*
- * given a group and vfsmount, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
-struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
- struct vfsmount *mnt)
-{
- struct mount *m = real_mount(mnt);
- struct fsnotify_mark *mark;
-
- spin_lock(&mnt->mnt_root->d_lock);
- mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
- spin_unlock(&mnt->mnt_root->d_lock);
-
- return mark;
-}
-
-/*
- * Attach an initialized mark to a given group and vfsmount.
- * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which groups.
- */
-int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
- struct fsnotify_group *group, struct vfsmount *mnt,
- int allow_dups)
-{
- struct mount *m = real_mount(mnt);
- int ret;
-
- mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
-
- BUG_ON(!mutex_is_locked(&group->mark_mutex));
- assert_spin_locked(&mark->lock);
-
- spin_lock(&mnt->mnt_root->d_lock);
- mark->mnt = mnt;
- ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
- m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
- spin_unlock(&mnt->mnt_root->d_lock);
-
- return ret;
-}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 1656843e87d2..323f492e0822 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -91,6 +91,7 @@ slow:
return ERR_PTR(-ENOMEM);
}
d_instantiate(dentry, inode);
+ dentry->d_flags |= DCACHE_RCUACCESS;
dentry->d_fsdata = (void *)ns->ops;
d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
if (d) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f6e871760f8d..0da0332725aa 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2242,13 +2242,13 @@ unlock:
spin_unlock(&o2hb_live_lock);
}
-static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
char *page)
{
return sprintf(page, "%u\n", o2hb_dead_threshold);
}
-static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
const char *page, size_t count)
{
unsigned long tmp;
@@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
}
-CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
- &o2hb_heartbeat_group_attr_threshold,
+ &o2hb_heartbeat_group_attr_dead_threshold,
&o2hb_heartbeat_group_attr_mode,
NULL,
};
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d0ab7e56d0b4..8d779227370a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
- init_timer(&sc->sc_idle_timeout);
- sc->sc_idle_timeout.function = o2net_idle_timer;
- sc->sc_idle_timeout.data = (unsigned long)sc;
+ setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
+ (unsigned long)sc);
sclog(sc, "alloced\n");
@@ -956,7 +955,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
mutex_lock(&sc->sc_send_lock);
ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
virt_to_page(kmalloced_virt),
- (long)kmalloced_virt & ~PAGE_MASK,
+ offset_in_page(kmalloced_virt),
size, MSG_DONTWAIT);
mutex_unlock(&sc->sc_send_lock);
if (ret == size)
@@ -1460,27 +1459,10 @@ static void o2net_rx_until_empty(struct work_struct *work)
static int o2net_set_nodelay(struct socket *sock)
{
- int ret, val = 1;
- mm_segment_t oldfs;
+ int val = 1;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
-
- /*
- * Dear unsuspecting programmer,
- *
- * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level
- * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
- * silently turn into SO_DEBUG.
- *
- * Yours,
- * Keeper of hilariously fragile interfaces.
- */
- ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char __user *)&val, sizeof(val));
-
- set_fs(oldfs);
- return ret;
+ return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (void *)&val, sizeof(val));
}
static int o2net_set_usertimeout(struct socket *sock)
@@ -1488,7 +1470,7 @@ static int o2net_set_usertimeout(struct socket *sock)
int user_timeout = O2NET_TCP_USER_TIMEOUT;
return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
- (char *)&user_timeout, sizeof(user_timeout));
+ (void *)&user_timeout, sizeof(user_timeout));
}
static void o2net_initialize_handshake(void)
diff --git a/fs/open.c b/fs/open.c
index 949cef29c3bb..4d23f729dcc6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1078,6 +1078,26 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
return do_sys_open(dfd, filename, flags, mode);
}
+#ifdef CONFIG_COMPAT
+/*
+ * Exactly like sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+{
+ return do_sys_open(AT_FDCWD, filename, flags, mode);
+}
+
+/*
+ * Exactly like sys_openat(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
+{
+ return do_sys_open(dfd, filename, flags, mode);
+}
+#endif
+
#ifndef __alpha__
/*
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index c4ab6fdf17a0..c19f0787c9c6 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file,
return -EINVAL;
}
+ /* Check for an empty list before locking. */
+ if (list_empty(&orangefs_request_list))
+ return -EAGAIN;
+
restart:
/* Get next op (if any) from top of list. */
spin_lock(&orangefs_request_list_lock);
@@ -208,14 +212,19 @@ restart:
continue;
/*
* Skip ops whose filesystem we don't know about unless
- * it is being mounted.
+ * it is being mounted or unmounted. It is possible for
+ * a filesystem we don't know about to be unmounted if
+ * it fails to mount in the kernel after userspace has
+ * been sent the mount request.
*/
/* XXX: is there a better way to detect this? */
} else if (ret == -1 &&
!(op->upcall.type ==
ORANGEFS_VFS_OP_FS_MOUNT ||
op->upcall.type ==
- ORANGEFS_VFS_OP_GETATTR)) {
+ ORANGEFS_VFS_OP_GETATTR ||
+ op->upcall.type ==
+ ORANGEFS_VFS_OP_FS_UMOUNT)) {
gossip_debug(GOSSIP_DEV_DEBUG,
"orangefs: skipping op tag %llu %s\n",
llu(op->tag), get_opname_string(op));
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 284373a57a08..d327cbd17756 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -1,396 +1,404 @@
/*
- * (C) 2001 Clemson University and The University of Chicago
- *
- * See COPYING in top-level directory.
+ * Copyright 2017 Omnibond Systems, L.L.C.
*/
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
+struct orangefs_dir_part {
+ struct orangefs_dir_part *next;
+ size_t len;
+};
+
+struct orangefs_dir {
+ __u64 token;
+ struct orangefs_dir_part *part;
+ loff_t end;
+ int error;
+};
+
+#define PART_SHIFT (24)
+#define PART_SIZE (1<<24)
+#define PART_MASK (~(PART_SIZE - 1))
+
/*
- * decode routine used by kmod to deal with the blob sent from
- * userspace for readdirs. The blob contains zero or more of these
- * sub-blobs:
- * __u32 - represents length of the character string that follows.
- * string - between 1 and ORANGEFS_NAME_MAX bytes long.
- * padding - (if needed) to cause the __u32 plus the string to be
- * eight byte aligned.
- * khandle - sizeof(khandle) bytes.
+ * There can be up to 512 directory entries. Each entry is encoded as
+ * follows:
+ * 4 bytes: string size (n)
+ * n bytes: string
+ * 1 byte: trailing zero
+ * padding to 8 bytes
+ * 16 bytes: khandle
+ * padding to 8 bytes
+ *
+ * The trailer_buf starts with a struct orangefs_readdir_response_s
+ * which must be skipped to get to the directory data.
+ *
+ * The data which is received from the userspace daemon is termed a
+ * part and is stored in a linked list in case more than one part is
+ * needed for a large directory.
+ *
+ * The position pointer (ctx->pos) encodes the part and offset on which
+ * to begin reading at. Bits above PART_SHIFT encode the part and bits
+ * below PART_SHIFT encode the offset. Parts are stored in a linked
+ * list which grows as data is received from the server. The overhead
+ * associated with managing the list is presumed to be small compared to
+ * the overhead of communicating with the server.
+ *
+ * As data is received from the server, it is placed at the end of the
+ * part list. Data is parsed from the current position as it is needed.
+ * When data is determined to be corrupt, it is either because the
+ * userspace component has sent back corrupt data or because the file
+ * pointer has been moved to an invalid location. Since the two cannot
+ * be differentiated, return EIO.
+ *
+ * Part zero is synthesized to contains `.' and `..'. Part one is the
+ * first part of the part list.
*/
-static long decode_dirents(char *ptr, size_t size,
- struct orangefs_readdir_response_s *readdir)
+
+static int do_readdir(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct orangefs_kernel_op_s *op)
{
- int i;
- struct orangefs_readdir_response_s *rd =
- (struct orangefs_readdir_response_s *) ptr;
- char *buf = ptr;
- int khandle_size = sizeof(struct orangefs_khandle);
- size_t offset = offsetof(struct orangefs_readdir_response_s,
- dirent_array);
- /* 8 reflects eight byte alignment */
- int smallest_blob = khandle_size + 8;
- __u32 len;
- int aligned_len;
- int sizeof_u32 = sizeof(__u32);
- long ret;
-
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
-
- /* size is = offset on empty dirs, > offset on non-empty dirs... */
- if (size < offset) {
- gossip_err("%s: size:%zu: offset:%zu:\n",
- __func__,
- size,
- offset);
- ret = -EINVAL;
- goto out;
- }
+ struct orangefs_readdir_response_s *resp;
+ int bufi, r;
- if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) {
- gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
- __func__,
- size,
- readdir->orangefs_dirent_outcount);
- ret = -EINVAL;
- goto out;
- }
+ /*
+ * Despite the badly named field, readdir does not use shared
+ * memory. However, there are a limited number of readdir
+ * slots, which must be allocated here. This flag simply tells
+ * the op scheduler to return the op here for retry.
+ */
+ op->uses_shared_memory = 1;
+ op->upcall.req.readdir.refn = oi->refn;
+ op->upcall.req.readdir.token = od->token;
+ op->upcall.req.readdir.max_dirent_count =
+ ORANGEFS_MAX_DIRENT_COUNT_READDIR;
- readdir->token = rd->token;
- readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
- readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
- sizeof(*readdir->dirent_array),
- GFP_KERNEL);
- if (readdir->dirent_array == NULL) {
- gossip_err("%s: kcalloc failed.\n", __func__);
- ret = -ENOMEM;
- goto out;
+again:
+ bufi = orangefs_readdir_index_get();
+ if (bufi < 0) {
+ od->error = bufi;
+ return bufi;
}
- buf += offset;
- size -= offset;
-
- for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
- if (size < smallest_blob) {
- gossip_err("%s: size:%zu: smallest_blob:%d:\n",
- __func__,
- size,
- smallest_blob);
- ret = -EINVAL;
- goto free;
- }
+ op->upcall.req.readdir.buf_index = bufi;
- len = *(__u32 *)buf;
- if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
- gossip_err("%s: len:%d:\n", __func__, len);
- ret = -EINVAL;
- goto free;
- }
+ r = service_operation(op, "orangefs_readdir",
+ get_interruptible_flag(dentry->d_inode));
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: size:%zu: len:%d:\n",
- __func__,
- size,
- len);
+ orangefs_readdir_index_put(bufi);
- readdir->dirent_array[i].d_name = buf + sizeof_u32;
- readdir->dirent_array[i].d_length = len;
+ if (op_state_purged(op)) {
+ if (r == -EAGAIN) {
+ vfree(op->downcall.trailer_buf);
+ goto again;
+ } else if (r == -EIO) {
+ vfree(op->downcall.trailer_buf);
+ od->error = r;
+ return r;
+ }
+ }
- /*
- * Calculate "aligned" length of this string and its
- * associated __u32 descriptor.
- */
- aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: aligned_len:%d:\n",
- __func__,
- aligned_len);
+ if (r < 0) {
+ vfree(op->downcall.trailer_buf);
+ od->error = r;
+ return r;
+ } else if (op->downcall.status) {
+ vfree(op->downcall.trailer_buf);
+ od->error = op->downcall.status;
+ return op->downcall.status;
+ }
- /*
- * The end of the blob should coincide with the end
- * of the last sub-blob.
- */
- if (size < aligned_len + khandle_size) {
- gossip_err("%s: ran off the end of the blob.\n",
- __func__);
- ret = -EINVAL;
- goto free;
- }
- size -= aligned_len + khandle_size;
+ /*
+ * The maximum size is size per entry times the 512 entries plus
+ * the header. This is well under the limit.
+ */
+ if (op->downcall.trailer_size > PART_SIZE) {
+ vfree(op->downcall.trailer_buf);
+ od->error = -EIO;
+ return -EIO;
+ }
- buf += aligned_len;
+ resp = (struct orangefs_readdir_response_s *)
+ op->downcall.trailer_buf;
+ od->token = resp->token;
+ return 0;
+}
- readdir->dirent_array[i].khandle =
- *(struct orangefs_khandle *) buf;
- buf += khandle_size;
+static int parse_readdir(struct orangefs_dir *od,
+ struct orangefs_kernel_op_s *op)
+{
+ struct orangefs_dir_part *part, *new;
+ size_t count;
+
+ count = 1;
+ part = od->part;
+ while (part) {
+ count++;
+ if (part->next)
+ part = part->next;
+ else
+ break;
}
- ret = buf - ptr;
- gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
- goto out;
-free:
- kfree(readdir->dirent_array);
- readdir->dirent_array = NULL;
+ new = (void *)op->downcall.trailer_buf;
+ new->next = NULL;
+ new->len = op->downcall.trailer_size -
+ sizeof(struct orangefs_readdir_response_s);
+ if (!od->part)
+ od->part = new;
+ else
+ part->next = new;
+ count++;
+ od->end = count << PART_SHIFT;
-out:
- return ret;
+ return 0;
}
-/*
- * Read directory entries from an instance of an open directory.
- */
-static int orangefs_readdir(struct file *file, struct dir_context *ctx)
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry)
{
- int ret = 0;
- int buffer_index;
- /*
- * ptoken supports Orangefs' distributed directory logic, added
- * in 2.9.2.
- */
- __u64 *ptoken = file->private_data;
- __u64 pos = 0;
- ino_t ino = 0;
- struct dentry *dentry = file->f_path.dentry;
- struct orangefs_kernel_op_s *new_op = NULL;
- struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
- struct orangefs_readdir_response_s readdir_response;
- void *dents_buf;
- int i = 0;
- int len = 0;
- ino_t current_ino = 0;
- char *current_entry = NULL;
- long bytes_decoded;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld, ptoken = %llu\n",
- __func__,
- lld(ctx->pos),
- llu(*ptoken));
-
- pos = (__u64) ctx->pos;
-
- /* are we done? */
- if (pos == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Skipping to termination path\n");
- return 0;
+ struct orangefs_kernel_op_s *op;
+ int r;
+
+ op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!op) {
+ od->error = -ENOMEM;
+ return -ENOMEM;
+ }
+ r = do_readdir(oi, od, dentry, op);
+ if (r) {
+ od->error = r;
+ goto out;
+ }
+ r = parse_readdir(od, op);
+ if (r) {
+ od->error = r;
+ goto out;
}
- gossip_debug(GOSSIP_DIR_DEBUG,
- "orangefs_readdir called on %pd (pos=%llu)\n",
- dentry, llu(pos));
+ od->error = 0;
+out:
+ op_release(op);
+ return od->error;
+}
- memset(&readdir_response, 0, sizeof(readdir_response));
+static int fill_from_part(struct orangefs_dir_part *part,
+ struct dir_context *ctx)
+{
+ const int offset = sizeof(struct orangefs_readdir_response_s);
+ struct orangefs_khandle *khandle;
+ __u32 *len, padlen;
+ loff_t i;
+ char *s;
+ i = ctx->pos & ~PART_MASK;
- new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
- if (!new_op)
- return -ENOMEM;
+ /* The file offset from userspace is too large. */
+ if (i > part->len)
+ return 1;
/*
- * Only the indices are shared. No memory is actually shared, but the
- * mechanism is used.
+ * If the seek pointer is positioned just before an entry it
+ * should find the next entry.
*/
- new_op->uses_shared_memory = 1;
- new_op->upcall.req.readdir.refn = orangefs_inode->refn;
- new_op->upcall.req.readdir.max_dirent_count =
- ORANGEFS_MAX_DIRENT_COUNT_READDIR;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: upcall.req.readdir.refn.khandle: %pU\n",
- __func__,
- &new_op->upcall.req.readdir.refn.khandle);
+ if (i % 8)
+ i = i + (8 - i%8)%8;
- new_op->upcall.req.readdir.token = *ptoken;
-
-get_new_buffer_index:
- buffer_index = orangefs_readdir_index_get();
- if (buffer_index < 0) {
- ret = buffer_index;
- gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
- ret);
- goto out_free_op;
+ while (i < part->len) {
+ if (part->len < i + sizeof *len)
+ break;
+ len = (void *)part + offset + i;
+ /*
+ * len is the size of the string itself. padlen is the
+ * total size of the encoded string.
+ */
+ padlen = (sizeof *len + *len + 1) +
+ (8 - (sizeof *len + *len + 1)%8)%8;
+ if (part->len < i + padlen + sizeof *khandle)
+ goto next;
+ s = (void *)part + offset + i + sizeof *len;
+ if (s[*len] != 0)
+ goto next;
+ khandle = (void *)part + offset + i + padlen;
+ if (!dir_emit(ctx, s, *len,
+ orangefs_khandle_to_ino(khandle),
+ DT_UNKNOWN))
+ return 0;
+ i += padlen + sizeof *khandle;
+ i = i + (8 - i%8)%8;
+ BUG_ON(i > part->len);
+ ctx->pos = (ctx->pos & PART_MASK) | i;
+ continue;
+next:
+ i += 8;
}
- new_op->upcall.req.readdir.buf_index = buffer_index;
-
- ret = service_operation(new_op,
- "orangefs_readdir",
- get_interruptible_flag(dentry->d_inode));
+ return 1;
+}
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir downcall status is %d. ret:%d\n",
- new_op->downcall.status,
- ret);
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct dir_context *ctx)
+{
+ struct orangefs_dir_part *part;
+ size_t count;
- orangefs_readdir_index_put(buffer_index);
+ count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1;
- if (ret == -EAGAIN && op_state_purged(new_op)) {
- /* Client-core indices are invalid after it restarted. */
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: Getting new buffer_index for retry of readdir..\n",
- __func__);
- goto get_new_buffer_index;
+ part = od->part;
+ while (part->next && count) {
+ count--;
+ part = part->next;
}
-
- if (ret == -EIO && op_state_purged(new_op)) {
- gossip_err("%s: Client is down. Aborting readdir call.\n",
- __func__);
- goto out_free_op;
+ /* This means the userspace file offset is invalid. */
+ if (count) {
+ od->error = -EIO;
+ return -EIO;
}
- if (ret < 0 || new_op->downcall.status != 0) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "Readdir request failed. Status:%d\n",
- new_op->downcall.status);
- if (ret >= 0)
- ret = new_op->downcall.status;
- goto out_free_op;
+ while (part && part->len) {
+ int r;
+ r = fill_from_part(part, ctx);
+ if (r < 0) {
+ od->error = r;
+ return r;
+ } else if (r == 0) {
+ /* Userspace buffer is full. */
+ break;
+ } else {
+ /*
+ * The part ran out of data. Move to the next
+ * part. */
+ ctx->pos = (ctx->pos & PART_MASK) +
+ (1 << PART_SHIFT);
+ part = part->next;
+ }
}
+ return 0;
+}
- dents_buf = new_op->downcall.trailer_buf;
- if (dents_buf == NULL) {
- gossip_err("Invalid NULL buffer in readdir response\n");
- ret = -ENOMEM;
- goto out_free_op;
+static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
+ int whence)
+{
+ struct orangefs_dir *od = file->private_data;
+ /*
+ * Delete the stored data so userspace sees new directory
+ * entries.
+ */
+ if (!whence && offset < od->end) {
+ struct orangefs_dir_part *part = od->part;
+ while (part) {
+ struct orangefs_dir_part *next = part->next;
+ vfree(part);
+ part = next;
+ }
+ od->token = ORANGEFS_ITERATE_START;
+ od->part = NULL;
+ od->end = 1 << PART_SHIFT;
}
+ return default_llseek(file, offset, whence);
+}
- bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
- &readdir_response);
- if (bytes_decoded < 0) {
- ret = bytes_decoded;
- gossip_err("Could not decode readdir from buffer %d\n", ret);
- goto out_vfree;
- }
+static int orangefs_dir_iterate(struct file *file,
+ struct dir_context *ctx)
+{
+ struct orangefs_inode_s *oi;
+ struct orangefs_dir *od;
+ struct dentry *dentry;
+ int r;
- if (bytes_decoded != new_op->downcall.trailer_size) {
- gossip_err("orangefs_readdir: # bytes decoded (%ld) "
- "!= trailer size (%ld)\n",
- bytes_decoded,
- (long)new_op->downcall.trailer_size);
- ret = -EINVAL;
- goto out_destroy_handle;
- }
+ dentry = file->f_path.dentry;
+ oi = ORANGEFS_I(dentry->d_inode);
+ od = file->private_data;
- /*
- * orangefs doesn't actually store dot and dot-dot, but
- * we need to have them represented.
- */
- if (pos == 0) {
- ino = get_ino_from_khandle(dentry->d_inode);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \".\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
- pos += 1;
- }
+ if (od->error)
+ return od->error;
- if (pos == 1) {
- ino = get_parent_ino_from_dentry(dentry);
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: calling dir_emit of \"..\" with pos = %llu\n",
- __func__,
- llu(pos));
- ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
- pos += 1;
+ if (ctx->pos == 0) {
+ if (!dir_emit_dot(file, ctx))
+ return 0;
+ ctx->pos++;
+ }
+ if (ctx->pos == 1) {
+ if (!dir_emit_dotdot(file, ctx))
+ return 0;
+ ctx->pos = 1 << PART_SHIFT;
}
/*
- * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
- * to prevent "finding" dot and dot-dot on any iteration
- * other than the first.
+ * The seek position is in the first synthesized part but is not
+ * valid.
*/
- if (ctx->pos == ORANGEFS_ITERATE_NEXT)
- ctx->pos = 0;
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: dirent_outcount:%d:\n",
- __func__,
- readdir_response.orangefs_dirent_outcount);
- for (i = ctx->pos;
- i < readdir_response.orangefs_dirent_outcount;
- i++) {
- len = readdir_response.dirent_array[i].d_length;
- current_entry = readdir_response.dirent_array[i].d_name;
- current_ino = orangefs_khandle_to_ino(
- &readdir_response.dirent_array[i].khandle);
-
- gossip_debug(GOSSIP_DIR_DEBUG,
- "calling dir_emit for %s with len %d"
- ", ctx->pos %ld\n",
- current_entry,
- len,
- (unsigned long)ctx->pos);
- /*
- * type is unknown. We don't return object type
- * in the dirent_array. This leaves getdents
- * clueless about type.
- */
- ret =
- dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
- if (!ret)
- break;
- ctx->pos++;
- gossip_debug(GOSSIP_DIR_DEBUG,
- "%s: ctx->pos:%lld\n",
- __func__,
- lld(ctx->pos));
+ if ((ctx->pos & PART_MASK) == 0)
+ return -EIO;
- }
+ r = 0;
/*
- * we ran all the way through the last batch, set up for
- * getting another batch...
+ * Must read more if the user has sought past what has been read
+ * so far. Stop a user who has sought past the end.
*/
- if (ret) {
- *ptoken = readdir_response.token;
- ctx->pos = ORANGEFS_ITERATE_NEXT;
+ while (od->token != ORANGEFS_ITERATE_END &&
+ ctx->pos > od->end) {
+ r = orangefs_dir_more(oi, od, dentry);
+ if (r)
+ return r;
+ }
+ if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end)
+ return -EIO;
+
+ /* Then try to fill if there's any left in the buffer. */
+ if (ctx->pos < od->end) {
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
+ if (r)
+ return r;
}
- /*
- * Did we hit the end of the directory?
- */
- if (readdir_response.token == ORANGEFS_READDIR_END) {
- gossip_debug(GOSSIP_DIR_DEBUG,
- "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
- ctx->pos = ORANGEFS_READDIR_END;
+ /* Finally get some more and try to fill. */
+ if (od->token != ORANGEFS_ITERATE_END) {
+ r = orangefs_dir_more(oi, od, dentry);
+ if (r)
+ return r;
+ r = orangefs_dir_fill(oi, od, dentry, ctx);
}
-out_destroy_handle:
- /* kfree(NULL) is safe */
- kfree(readdir_response.dirent_array);
-out_vfree:
- gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
- vfree(dents_buf);
-out_free_op:
- op_release(new_op);
- gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
- return ret;
+ return r;
}
static int orangefs_dir_open(struct inode *inode, struct file *file)
{
- __u64 *ptoken;
-
- file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
+ struct orangefs_dir *od;
+ file->private_data = kmalloc(sizeof(struct orangefs_dir),
+ GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
-
- ptoken = file->private_data;
- *ptoken = ORANGEFS_READDIR_START;
+ od = file->private_data;
+ od->token = ORANGEFS_ITERATE_START;
+ od->part = NULL;
+ od->end = 1 << PART_SHIFT;
+ od->error = 0;
return 0;
}
static int orangefs_dir_release(struct inode *inode, struct file *file)
{
+ struct orangefs_dir *od = file->private_data;
+ struct orangefs_dir_part *part = od->part;
orangefs_flush_inode(inode);
- kfree(file->private_data);
+ while (part) {
+ struct orangefs_dir_part *next = part->next;
+ vfree(part);
+ part = next;
+ }
+ kfree(od);
return 0;
}
-/** ORANGEFS implementation of VFS directory operations */
const struct file_operations orangefs_dir_operations = {
+ .llseek = orangefs_dir_llseek,
.read = generic_read_dir,
- .iterate = orangefs_readdir,
+ .iterate = orangefs_dir_iterate,
.open = orangefs_dir_open,
- .release = orangefs_dir_release,
+ .release = orangefs_dir_release
};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 3b8923f8bf21..163001c95501 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -40,16 +40,6 @@ struct orangefs_mkdir_response {
struct orangefs_object_kref refn;
};
-/*
- * duplication of some system interface structures so that I don't have
- * to allocate extra memory
- */
-struct orangefs_dirent {
- char *d_name;
- int d_length;
- struct orangefs_khandle khandle;
-};
-
struct orangefs_statfs_response {
__s64 block_size;
__s64 blocks_total;
@@ -131,12 +121,16 @@ struct orangefs_downcall_s {
} resp;
};
+/*
+ * The readdir response comes in the trailer. It is followed by the
+ * directory entries as described in dir.c.
+ */
+
struct orangefs_readdir_response_s {
__u64 token;
__u64 directory_version;
__u32 __pad2;
__u32 orangefs_dirent_outcount;
- struct orangefs_dirent *dirent_array;
};
#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index e6bbc8083d77..28f38d813ad2 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -114,7 +114,6 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
struct orangefs_kernel_op_s *new_op = NULL;
- struct iov_iter saved = *iter;
int buffer_index = -1;
ssize_t ret;
@@ -193,7 +192,7 @@ populate_shared_memory:
orangefs_bufmap_put(buffer_index);
buffer_index = -1;
if (type == ORANGEFS_IO_WRITE)
- *iter = saved;
+ iov_iter_revert(iter, total_size);
gossip_debug(GOSSIP_FILE_DEBUG,
"%s:going to repopulate_shared_memory.\n",
__func__);
@@ -475,7 +474,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
/* Make sure generic_write_checks sees an up to date inode size. */
if (file->f_flags & O_APPEND) {
- rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+ STATX_SIZE);
if (rc == -ESTALE)
rc = -EIO;
if (rc) {
@@ -693,7 +693,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
* NOTE: We are only interested in file size here,
* so we set mask accordingly.
*/
- ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+ STATX_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a304bf34b212..9428ea0aac16 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
iattr->ia_size);
/* Ensure that we have a up to date size, so we know if it changed. */
- ret = orangefs_inode_getattr(inode, 0, 1);
+ ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE);
if (ret == -ESTALE)
ret = -EIO;
if (ret) {
@@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
if (ret)
goto out;
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
+ if (iattr->ia_valid & ATTR_SIZE) {
ret = orangefs_setattr_size(inode, iattr);
if (ret)
goto out;
@@ -256,13 +255,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
"orangefs_getattr: called on %pd\n",
path->dentry);
- ret = orangefs_inode_getattr(inode, 0, 0);
+ ret = orangefs_inode_getattr(inode, 0, 0, request_mask);
if (ret == 0) {
generic_fillattr(inode, stat);
/* override block size reported to stat */
orangefs_inode = ORANGEFS_I(inode);
stat->blksize = orangefs_inode->blksize;
+
+ if (request_mask & STATX_SIZE)
+ stat->result_mask = STATX_BASIC_STATS;
+ else
+ stat->result_mask = STATX_BASIC_STATS &
+ ~STATX_SIZE;
}
return ret;
}
@@ -277,7 +282,7 @@ int orangefs_permission(struct inode *inode, int mask)
gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
/* Make sure the permission (and other common attrs) are up to date. */
- ret = orangefs_inode_getattr(inode, 0, 0);
+ ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE);
if (ret < 0)
return ret;
@@ -375,7 +380,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
if (!inode || !(inode->i_state & I_NEW))
return inode;
- error = orangefs_inode_getattr(inode, 1, 1);
+ error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
if (error) {
iget_failed(inode);
return ERR_PTR(error);
@@ -420,7 +425,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
- error = orangefs_inode_getattr(inode, 1, 1);
+ error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
if (error)
goto out_iput;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index a290ff6ec756..478e88bd7f9d 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir,
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: dentry instantiated for %pd\n",
@@ -193,8 +194,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- ORANGEFS_I(inode)->getattr_time = jiffies - 1;
-
gossip_debug(GOSSIP_NAME_DEBUG,
"%s:%s:%d "
"Found good inode [%lu] with count [%d]\n",
@@ -324,6 +323,7 @@ static int orangefs_symlink(struct inode *dir,
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Symlink) %pU -> %pd\n",
@@ -388,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
unlock_new_inode(inode);
orangefs_set_timeout(dentry);
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+ ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Directory) %pU -> %pd\n",
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 6333cbbdfef7..83b506020718 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -521,13 +521,11 @@ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
size_t n = size;
if (n > PAGE_SIZE)
n = PAGE_SIZE;
- n = copy_page_from_iter(page, 0, n, iter);
- if (!n)
+ if (copy_page_from_iter(page, 0, n, iter) != n)
return -EFAULT;
size -= n;
}
return 0;
-
}
/*
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 791912da97d7..716ed337f166 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file,
"orangefs_debug_write: %pD\n",
file);
+ if (count == 0)
+ return 0;
+
/*
* Thwart users who try to jamb a ridiculous number
* of bytes into the debug file...
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index f380f9ed1b28..efe08c763e56 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -52,12 +52,7 @@
*/
#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800
-/*
- * The maximum number of directory entries in a single request is 96.
- * XXX: Why can this not be higher. The client-side code can handle up to 512.
- * XXX: What happens if we expect more than the client can return?
- */
-#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
+#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512
#include "upcall.h"
#include "downcall.h"
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 5e48a0be9761..ea0ce507a6ab 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -215,6 +215,7 @@ struct orangefs_inode_s {
unsigned long pinode_flags;
unsigned long getattr_time;
+ u32 getattr_mask;
};
#define P_ATIME_FLAG 0
@@ -249,6 +250,7 @@ struct orangefs_sb_info_s {
char devname[ORANGEFS_MAX_SERVER_ADDR_LEN];
struct super_block *sb;
int mount_pending;
+ int no_list;
struct list_head list;
};
@@ -339,11 +341,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
return &(ORANGEFS_I(inode)->refn.khandle);
}
-static inline __s32 get_fsid_from_ino(struct inode *inode)
-{
- return ORANGEFS_I(inode)->refn.fs_id;
-}
-
static inline ino_t get_ino_from_khandle(struct inode *inode)
{
struct orangefs_khandle *khandle;
@@ -499,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode,
size_t size,
int flags);
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+ u32 request_mask);
int orangefs_inode_check_changed(struct inode *inode);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 9b96b99539d6..aab6f1842963 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
return 0;
}
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+ u32 request_mask)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
get_khandle_from_ino(inode));
if (!new && !bypass) {
- if (time_before(jiffies, orangefs_inode->getattr_time))
+ /*
+ * Must have all the attributes in the mask and be within cache
+ * time.
+ */
+ if ((request_mask & orangefs_inode->getattr_mask) ==
+ request_mask &&
+ time_before(jiffies, orangefs_inode->getattr_time))
return 0;
}
@@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
if (!new_op)
return -ENOMEM;
new_op->upcall.req.getattr.refn = orangefs_inode->refn;
- new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+ /*
+ * Size is the hardest attribute to get. The incremental cost of any
+ * other attribute is essentially zero.
+ */
+ if (request_mask & STATX_SIZE || new)
+ new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+ else
+ new_op->upcall.req.getattr.mask =
+ ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE;
ret = service_operation(new_op, __func__,
get_interruptible_flag(inode));
@@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
case S_IFREG:
inode->i_flags = orangefs_inode_flags(&new_op->
downcall.resp.getattr.attributes);
- inode_size = (loff_t)new_op->
- downcall.resp.getattr.attributes.size;
- rounded_up_size =
- (inode_size + (4096 - (inode_size % 4096)));
- inode->i_size = inode_size;
- orangefs_inode->blksize =
- new_op->downcall.resp.getattr.attributes.blksize;
- spin_lock(&inode->i_lock);
- inode->i_bytes = inode_size;
- inode->i_blocks =
- (unsigned long)(rounded_up_size / 512);
- spin_unlock(&inode->i_lock);
+ if (request_mask & STATX_SIZE || new) {
+ inode_size = (loff_t)new_op->
+ downcall.resp.getattr.attributes.size;
+ rounded_up_size =
+ (inode_size + (4096 - (inode_size % 4096)));
+ inode->i_size = inode_size;
+ orangefs_inode->blksize =
+ new_op->downcall.resp.getattr.attributes.blksize;
+ spin_lock(&inode->i_lock);
+ inode->i_bytes = inode_size;
+ inode->i_blocks =
+ (unsigned long)(rounded_up_size / 512);
+ spin_unlock(&inode->i_lock);
+ }
break;
case S_IFDIR:
- inode->i_size = PAGE_SIZE;
- orangefs_inode->blksize = i_blocksize(inode);
- spin_lock(&inode->i_lock);
- inode_set_bytes(inode, inode->i_size);
- spin_unlock(&inode->i_lock);
+ if (request_mask & STATX_SIZE || new) {
+ inode->i_size = PAGE_SIZE;
+ orangefs_inode->blksize = i_blocksize(inode);
+ spin_lock(&inode->i_lock);
+ inode_set_bytes(inode, inode->i_size);
+ spin_unlock(&inode->i_lock);
+ }
set_nlink(inode, 1);
break;
case S_IFLNK:
@@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
orangefs_inode->getattr_time = jiffies +
orangefs_getattr_timeout_msecs*HZ/1000;
+ if (request_mask & STATX_SIZE || new)
+ orangefs_inode->getattr_mask = STATX_BASIC_STATS;
+ else
+ orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE;
ret = 0;
out:
op_release(new_op);
@@ -500,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode)
return ret;
}
-int orangefs_unmount_sb(struct super_block *sb)
-{
- int ret = -EINVAL;
- struct orangefs_kernel_op_s *new_op = NULL;
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_unmount_sb called on sb %p\n",
- sb);
-
- new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
- if (!new_op)
- return -ENOMEM;
- new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
- new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
- strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
- ORANGEFS_SB(sb)->devname,
- ORANGEFS_MAX_SERVER_ADDR_LEN);
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "Attempting ORANGEFS Unmount via host %s\n",
- new_op->upcall.req.fs_umount.orangefs_config_server);
-
- ret = service_operation(new_op, "orangefs_fs_umount", 0);
-
- gossip_debug(GOSSIP_UTILS_DEBUG,
- "orangefs_unmount: got return value of %d\n", ret);
- if (ret)
- sb = ERR_PTR(ret);
- else
- ORANGEFS_SB(sb)->mount_pending = 1;
-
- op_release(new_op);
- return ret;
-}
-
void orangefs_make_bad_inode(struct inode *inode)
{
if (is_root_handle(inode)) {
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 971307ad69be..48bcc1bbe415 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset;
#define ORANGEFS_G_SGID (1 << 10)
#define ORANGEFS_U_SUID (1 << 11)
-/* definition taken from stdint.h */
-#define INT32_MAX (2147483647)
-#define ORANGEFS_ITERATE_START (INT32_MAX - 1)
-#define ORANGEFS_ITERATE_END (INT32_MAX - 2)
-#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3)
-#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
-#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END
+#define ORANGEFS_ITERATE_START 2147483646
+#define ORANGEFS_ITERATE_END 2147483645
#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
#define ORANGEFS_APPEND_FL FS_APPEND_FL
#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index cd261c8de53a..5c7c273e17ec 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = {
.fh_to_dentry = orangefs_fh_to_dentry,
};
+static int orangefs_unmount(int id, __s32 fs_id, const char *devname)
+{
+ struct orangefs_kernel_op_s *op;
+ int r;
+ op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
+ if (!op)
+ return -ENOMEM;
+ op->upcall.req.fs_umount.id = id;
+ op->upcall.req.fs_umount.fs_id = fs_id;
+ strncpy(op->upcall.req.fs_umount.orangefs_config_server,
+ devname, ORANGEFS_MAX_SERVER_ADDR_LEN);
+ r = service_operation(op, "orangefs_fs_umount", 0);
+ /* Not much to do about an error here. */
+ if (r)
+ gossip_err("orangefs_unmount: service_operation %d\n", r);
+ op_release(op);
+ return r;
+}
+
static int orangefs_fill_sb(struct super_block *sb,
struct orangefs_fs_mount_response *fs_mount,
void *data, int silent)
@@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
if (IS_ERR(sb)) {
d = ERR_CAST(sb);
+ orangefs_unmount(new_op->downcall.resp.fs_mount.id,
+ new_op->downcall.resp.fs_mount.fs_id, devname);
goto free_op;
}
@@ -493,7 +514,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
if (ret) {
d = ERR_PTR(ret);
- goto free_op;
+ goto free_sb_and_op;
}
/*
@@ -519,6 +540,9 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
spin_unlock(&orangefs_superblocks_lock);
op_release(new_op);
+ /* Must be removed from the list now. */
+ ORANGEFS_SB(sb)->no_list = 0;
+
if (orangefs_userspace_version >= 20906) {
new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
if (!new_op)
@@ -533,6 +557,11 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
return dget(sb->s_root);
+free_sb_and_op:
+ /* Will call orangefs_kill_sb with sb not in list. */
+ ORANGEFS_SB(sb)->no_list = 1;
+ /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */
+ deactivate_locked_super(sb);
free_op:
gossip_err("orangefs_mount: mount request failed with %d\n", ret);
if (ret == -EINVAL) {
@@ -547,6 +576,7 @@ free_op:
void orangefs_kill_sb(struct super_block *sb)
{
+ int r;
gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
/* provided sb cleanup */
@@ -556,14 +586,19 @@ void orangefs_kill_sb(struct super_block *sb)
* issue the unmount to userspace to tell it to remove the
* dynamic mount info it has for this superblock
*/
- orangefs_unmount_sb(sb);
-
- /* remove the sb from our list of orangefs specific sb's */
-
- spin_lock(&orangefs_superblocks_lock);
- __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init */
- ORANGEFS_SB(sb)->list.prev = NULL;
- spin_unlock(&orangefs_superblocks_lock);
+ r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id,
+ ORANGEFS_SB(sb)->devname);
+ if (!r)
+ ORANGEFS_SB(sb)->mount_pending = 1;
+
+ if (!ORANGEFS_SB(sb)->no_list) {
+ /* remove the sb from our list of orangefs specific sb's */
+ spin_lock(&orangefs_superblocks_lock);
+ /* not list_del_init */
+ __list_del_entry(&ORANGEFS_SB(sb)->list);
+ ORANGEFS_SB(sb)->list.prev = NULL;
+ spin_unlock(&orangefs_superblocks_lock);
+ }
/*
* make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index abcfa3fa9992..61e2ca7fec55 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -124,7 +124,14 @@ retry_servicing:
gossip_debug(GOSSIP_WAIT_DEBUG,
"%s:client core is NOT in service.\n",
__func__);
- timeout = op_timeout_secs * HZ;
+ /*
+ * Don't wait for the userspace component to return if
+ * the filesystem is being umounted anyway.
+ */
+ if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT)
+ timeout = 0;
+ else
+ timeout = op_timeout_secs * HZ;
}
spin_unlock(&orangefs_request_list_lock);
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 74a81b1daaac..237c9c04dc3b 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err("Invalid key length (%d)\n",
- (int)strlen(name));
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
return -EINVAL;
- }
fsuid = from_kuid(&init_user_ns, current_fsuid());
fsgid = from_kgid(&init_user_ns, current_fsgid());
@@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name,
struct orangefs_kernel_op_s *new_op = NULL;
int ret = -ENOMEM;
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
+ return -EINVAL;
+
down_write(&orangefs_inode->xattr_sem);
new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
if (!new_op)
@@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name,
"%s: name %s, buffer_size %zd\n",
__func__, name, size);
- if (size >= ORANGEFS_MAX_XATTR_VALUELEN ||
- flags < 0) {
- gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
- (int)size,
- flags);
+ if (size > ORANGEFS_MAX_XATTR_VALUELEN)
+ return -EINVAL;
+ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
return -EINVAL;
- }
internal_flag = convert_to_internal_xattr_flags(flags);
- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
- gossip_err
- ("orangefs_inode_setxattr: bogus key size (%d)\n",
- (int)(strlen(name)));
- return -EINVAL;
- }
-
/* This is equivalent to a removexattr */
if (size == 0 && value == NULL) {
gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -358,7 +348,7 @@ try_again:
returned_count = new_op->downcall.resp.listxattr.returned_count;
if (returned_count < 0 ||
- returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) {
+ returned_count > ORANGEFS_MAX_XATTR_LISTLEN) {
gossip_err("%s: impossible value for returned_count:%d:\n",
__func__,
returned_count);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c87b6b9a8a76..9e3ac5c11780 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2834,6 +2834,15 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
return err;
}
+#ifdef CONFIG_LIVEPATCH
+static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ seq_printf(m, "%d\n", task->patch_state);
+ return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
/*
* Thread groups
*/
@@ -2933,6 +2942,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("timers", S_IRUGO, proc_timers_operations),
#endif
REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
+#ifdef CONFIG_LIVEPATCH
+ ONE("patch_state", S_IRUSR, proc_pid_patch_state),
+#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3315,6 +3327,9 @@ static const struct pid_entry tid_base_stuff[] = {
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
+#ifdef CONFIG_LIVEPATCH
+ ONE("patch_state", S_IRUSR, proc_pid_patch_state),
+#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index ee27feb34cf4..9425c0d97262 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -472,6 +472,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent->data = NULL;
ent->proc_fops = NULL;
ent->proc_iops = NULL;
+ parent->nlink++;
if (proc_register(parent, ent) < 0) {
kfree(ent);
parent->nlink--;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d04ea4349909..67985a7233c2 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr
*pentry = entry;
}
-void register_sysctl_root(struct ctl_table_root *root)
-{
-}
-
/*
* sysctl_perm does NOT grant the superuser all rights automatically, because
* some sysctl variables are readonly even to root.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f08bd31c1081..f0c8b33d99b1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -441,6 +441,7 @@ struct mem_size_stats {
unsigned long private_dirty;
unsigned long referenced;
unsigned long anonymous;
+ unsigned long lazyfree;
unsigned long anonymous_thp;
unsigned long shmem_thp;
unsigned long swap;
@@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
int i, nr = compound ? 1 << compound_order(page) : 1;
unsigned long size = nr * PAGE_SIZE;
- if (PageAnon(page))
+ if (PageAnon(page)) {
mss->anonymous += size;
+ if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+ mss->lazyfree += size;
+ }
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
@@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Private_Dirty: %8lu kB\n"
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
+ "LazyFree: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"ShmemPmdMapped: %8lu kB\n"
"Shared_Hugetlb: %8lu kB\n"
@@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.private_dirty >> 10,
mss.referenced >> 10,
mss.anonymous >> 10,
+ mss.lazyfree >> 10,
mss.anonymous_thp >> 10,
mss.shmem_thp >> 10,
mss.shared_hugetlb >> 10,
@@ -900,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+ pmd_t pmd = *pmdp;
+
+ /* See comment in change_huge_pmd() */
+ pmdp_invalidate(vma, addr, pmdp);
+ if (pmd_dirty(*pmdp))
+ pmd = pmd_mkdirty(pmd);
+ if (pmd_young(*pmdp))
+ pmd = pmd_mkyoung(pmd);
pmd = pmd_wrprotect(pmd);
pmd = pmd_clear_soft_dirty(pmd);
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 899d0ba0bd6c..06aab07b6bb7 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -37,6 +37,12 @@ static void notrace pstore_ftrace_call(unsigned long ip,
{
unsigned long flags;
struct pstore_ftrace_record rec = {};
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_FTRACE,
+ .buf = (char *)&rec,
+ .size = sizeof(rec),
+ .psi = psinfo,
+ };
if (unlikely(oops_in_progress))
return;
@@ -47,8 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
rec.parent_ip = parent_ip;
pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
- psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
- 0, sizeof(rec), psinfo);
+ psinfo->write(&record);
local_irq_restore(flags);
}
@@ -117,7 +122,7 @@ void pstore_register_ftrace(void)
{
struct dentry *file;
- if (!psinfo->write_buf)
+ if (!psinfo->write)
return;
pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 57c0646479f5..792a4e5f9226 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -47,12 +47,8 @@ static LIST_HEAD(allpstore);
struct pstore_private {
struct list_head list;
- struct pstore_info *psi;
- enum pstore_type_id type;
- u64 id;
- int count;
- ssize_t size;
- char data[];
+ struct pstore_record *record;
+ size_t total_size;
};
struct pstore_ftrace_seq_data {
@@ -63,6 +59,17 @@ struct pstore_ftrace_seq_data {
#define REC_SIZE sizeof(struct pstore_ftrace_record)
+static void free_pstore_private(struct pstore_private *private)
+{
+ if (!private)
+ return;
+ if (private->record) {
+ kfree(private->record->buf);
+ kfree(private->record);
+ }
+ kfree(private);
+}
+
static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
{
struct pstore_private *ps = s->private;
@@ -72,9 +79,9 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
if (!data)
return NULL;
- data->off = ps->size % REC_SIZE;
+ data->off = ps->total_size % REC_SIZE;
data->off += *pos * REC_SIZE;
- if (data->off + REC_SIZE > ps->size) {
+ if (data->off + REC_SIZE > ps->total_size) {
kfree(data);
return NULL;
}
@@ -94,7 +101,7 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
struct pstore_ftrace_seq_data *data = v;
data->off += REC_SIZE;
- if (data->off + REC_SIZE > ps->size)
+ if (data->off + REC_SIZE > ps->total_size)
return NULL;
(*pos)++;
@@ -105,7 +112,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
{
struct pstore_private *ps = s->private;
struct pstore_ftrace_seq_data *data = v;
- struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
+ struct pstore_ftrace_record *rec;
+
+ rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off);
seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n",
pstore_ftrace_decode_cpu(rec),
@@ -125,7 +134,7 @@ static const struct seq_operations pstore_ftrace_seq_ops = {
static int pstore_check_syslog_permissions(struct pstore_private *ps)
{
- switch (ps->type) {
+ switch (ps->record->type) {
case PSTORE_TYPE_DMESG:
case PSTORE_TYPE_CONSOLE:
return check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
@@ -141,9 +150,10 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
struct seq_file *sf = file->private_data;
struct pstore_private *ps = sf->private;
- if (ps->type == PSTORE_TYPE_FTRACE)
+ if (ps->record->type == PSTORE_TYPE_FTRACE)
return seq_read(file, userbuf, count, ppos);
- return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
+ return simple_read_from_buffer(userbuf, count, ppos,
+ ps->record->buf, ps->total_size);
}
static int pstore_file_open(struct inode *inode, struct file *file)
@@ -157,7 +167,7 @@ static int pstore_file_open(struct inode *inode, struct file *file)
if (err)
return err;
- if (ps->type == PSTORE_TYPE_FTRACE)
+ if (ps->record->type == PSTORE_TYPE_FTRACE)
sops = &pstore_ftrace_seq_ops;
err = seq_open(file, sops);
@@ -193,20 +203,19 @@ static const struct file_operations pstore_file_operations = {
static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
struct pstore_private *p = d_inode(dentry)->i_private;
+ struct pstore_record *record = p->record;
int err;
err = pstore_check_syslog_permissions(p);
if (err)
return err;
- if (p->psi->erase) {
- mutex_lock(&p->psi->read_mutex);
- p->psi->erase(p->type, p->id, p->count,
- d_inode(dentry)->i_ctime, p->psi);
- mutex_unlock(&p->psi->read_mutex);
- } else {
+ if (!record->psi->erase)
return -EPERM;
- }
+
+ mutex_lock(&record->psi->read_mutex);
+ record->psi->erase(record);
+ mutex_unlock(&record->psi->read_mutex);
return simple_unlink(dir, dentry);
}
@@ -221,7 +230,7 @@ static void pstore_evict_inode(struct inode *inode)
spin_lock_irqsave(&allpstore_lock, flags);
list_del(&p->list);
spin_unlock_irqrestore(&allpstore_lock, flags);
- kfree(p);
+ free_pstore_private(p);
}
}
@@ -302,23 +311,23 @@ bool pstore_is_mounted(void)
* Load it up with "size" bytes of data from "buf".
* Set the mtime & ctime to the date that this record was originally stored.
*/
-int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
- char *data, bool compressed, size_t size,
- struct timespec time, struct pstore_info *psi)
+int pstore_mkfile(struct dentry *root, struct pstore_record *record)
{
- struct dentry *root = pstore_sb->s_root;
struct dentry *dentry;
struct inode *inode;
int rc = 0;
char name[PSTORE_NAMELEN];
struct pstore_private *private, *pos;
unsigned long flags;
+ size_t size = record->size + record->ecc_notice_size;
+
+ WARN_ON(!inode_is_locked(d_inode(root)));
spin_lock_irqsave(&allpstore_lock, flags);
list_for_each_entry(pos, &allpstore, list) {
- if (pos->type == type &&
- pos->id == id &&
- pos->psi == psi) {
+ if (pos->record->type == record->type &&
+ pos->record->id == record->id &&
+ pos->record->psi == record->psi) {
rc = -EEXIST;
break;
}
@@ -328,72 +337,74 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
return rc;
rc = -ENOMEM;
- inode = pstore_get_inode(pstore_sb);
+ inode = pstore_get_inode(root->d_sb);
if (!inode)
goto fail;
inode->i_mode = S_IFREG | 0444;
inode->i_fop = &pstore_file_operations;
- private = kmalloc(sizeof *private + size, GFP_KERNEL);
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
if (!private)
goto fail_alloc;
- private->type = type;
- private->id = id;
- private->count = count;
- private->psi = psi;
+ private->record = record;
- switch (type) {
+ switch (record->type) {
case PSTORE_TYPE_DMESG:
scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
- psname, id, compressed ? ".enc.z" : "");
+ record->psi->name, record->id,
+ record->compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
- scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "console-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_FTRACE:
- scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "ftrace-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_MCE:
- scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "mce-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_RTAS:
- scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "rtas-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_OF:
scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
- psname, id);
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_COMMON:
scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
- psname, id);
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PMSG:
- scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "pmsg-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_PPC_OPAL:
- sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld",
+ record->psi->name, record->id);
break;
case PSTORE_TYPE_UNKNOWN:
- scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "unknown-%s-%lld",
+ record->psi->name, record->id);
break;
default:
scnprintf(name, sizeof(name), "type%d-%s-%lld",
- type, psname, id);
+ record->type, record->psi->name, record->id);
break;
}
- inode_lock(d_inode(root));
-
dentry = d_alloc_name(root, name);
if (!dentry)
- goto fail_lockedalloc;
+ goto fail_private;
- memcpy(private->data, data, size);
- inode->i_size = private->size = size;
+ inode->i_size = private->total_size = size;
inode->i_private = private;
- if (time.tv_sec)
- inode->i_mtime = inode->i_ctime = time;
+ if (record->time.tv_sec)
+ inode->i_mtime = inode->i_ctime = record->time;
d_add(dentry, inode);
@@ -401,13 +412,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
list_add(&private->list, &allpstore);
spin_unlock_irqrestore(&allpstore_lock, flags);
- inode_unlock(d_inode(root));
-
return 0;
-fail_lockedalloc:
- inode_unlock(d_inode(root));
- kfree(private);
+fail_private:
+ free_pstore_private(private);
fail_alloc:
iput(inode);
@@ -415,6 +423,27 @@ fail:
return rc;
}
+/*
+ * Read all the records from the persistent store. Create
+ * files in our filesystem. Don't warn about -EEXIST errors
+ * when we are re-scanning the backing store looking to add new
+ * error records.
+ */
+void pstore_get_records(int quiet)
+{
+ struct pstore_info *psi = psinfo;
+ struct dentry *root;
+
+ if (!psi || !pstore_sb)
+ return;
+
+ root = pstore_sb->s_root;
+
+ inode_lock(d_inode(root));
+ pstore_get_backend_records(psi, root, quiet);
+ inode_unlock(d_inode(root));
+}
+
static int pstore_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index da416e6591c9..c416e653dc4f 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -25,10 +25,10 @@ extern struct pstore_info *psinfo;
extern void pstore_set_kmsg_bytes(int);
extern void pstore_get_records(int);
-extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
- int count, char *data, bool compressed,
- size_t size, struct timespec time,
- struct pstore_info *psi);
+extern void pstore_get_backend_records(struct pstore_info *psi,
+ struct dentry *root, int quiet);
+extern int pstore_mkfile(struct dentry *root,
+ struct pstore_record *record);
extern bool pstore_is_mounted(void);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index efab7b64925b..d468eec9b8a6 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -267,7 +267,7 @@ static void free_zlib(void)
big_oops_buf_sz = 0;
}
-static struct pstore_zbackend backend_zlib = {
+static const struct pstore_zbackend backend_zlib = {
.compress = compress_zlib,
.decompress = decompress_zlib,
.allocate = allocate_zlib,
@@ -328,7 +328,7 @@ static void free_lzo(void)
big_oops_buf_sz = 0;
}
-static struct pstore_zbackend backend_lzo = {
+static const struct pstore_zbackend backend_lzo = {
.compress = compress_lzo,
.decompress = decompress_lzo,
.allocate = allocate_lzo,
@@ -393,7 +393,7 @@ static void free_lz4(void)
big_oops_buf_sz = 0;
}
-static struct pstore_zbackend backend_lz4 = {
+static const struct pstore_zbackend backend_lz4 = {
.compress = compress_lz4,
.decompress = decompress_lz4,
.allocate = allocate_lz4,
@@ -402,7 +402,7 @@ static struct pstore_zbackend backend_lz4 = {
};
#endif
-static struct pstore_zbackend *zbackend =
+static const struct pstore_zbackend *zbackend =
#if defined(CONFIG_PSTORE_ZLIB_COMPRESS)
&backend_zlib;
#elif defined(CONFIG_PSTORE_LZO_COMPRESS)
@@ -484,7 +484,6 @@ static void pstore_dump(struct kmsg_dumper *dumper,
{
unsigned long total = 0;
const char *why;
- u64 id;
unsigned int part = 1;
unsigned long flags = 0;
int is_locked;
@@ -506,48 +505,59 @@ static void pstore_dump(struct kmsg_dumper *dumper,
oopscount++;
while (total < kmsg_bytes) {
char *dst;
- unsigned long size;
- int hsize;
+ size_t dst_size;
+ int header_size;
int zipped_len = -1;
- size_t len;
- bool compressed = false;
- size_t total_len;
+ size_t dump_size;
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_DMESG,
+ .count = oopscount,
+ .reason = reason,
+ .part = part,
+ .compressed = false,
+ .buf = psinfo->buf,
+ .psi = psinfo,
+ };
if (big_oops_buf && is_locked) {
dst = big_oops_buf;
- size = big_oops_buf_sz;
+ dst_size = big_oops_buf_sz;
} else {
dst = psinfo->buf;
- size = psinfo->bufsize;
+ dst_size = psinfo->bufsize;
}
- hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part);
- size -= hsize;
+ /* Write dump header. */
+ header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why,
+ oopscount, part);
+ dst_size -= header_size;
- if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
- size, &len))
+ /* Write dump contents. */
+ if (!kmsg_dump_get_buffer(dumper, true, dst + header_size,
+ dst_size, &dump_size))
break;
if (big_oops_buf && is_locked) {
zipped_len = pstore_compress(dst, psinfo->buf,
- hsize + len, psinfo->bufsize);
+ header_size + dump_size,
+ psinfo->bufsize);
if (zipped_len > 0) {
- compressed = true;
- total_len = zipped_len;
+ record.compressed = true;
+ record.size = zipped_len;
} else {
- total_len = copy_kmsg_to_buffer(hsize, len);
+ record.size = copy_kmsg_to_buffer(header_size,
+ dump_size);
}
} else {
- total_len = hsize + len;
+ record.size = header_size + dump_size;
}
- ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
- oopscount, compressed, total_len, psinfo);
+ ret = psinfo->write(&record);
if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
pstore_new_entry = 1;
- total += total_len;
+ total += record.size;
part++;
}
if (is_locked)
@@ -577,8 +587,11 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
const char *e = s + c;
while (s < e) {
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_CONSOLE,
+ .psi = psinfo,
+ };
unsigned long flags;
- u64 id;
if (c > psinfo->bufsize)
c = psinfo->bufsize;
@@ -589,8 +602,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
} else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
- psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0,
- s, 0, c, psinfo);
+ record.buf = (char *)s;
+ record.size = c;
+ psinfo->write(&record);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
s += c;
c = e - s;
@@ -618,48 +632,30 @@ static void pstore_register_console(void) {}
static void pstore_unregister_console(void) {}
#endif
-static int pstore_write_compat(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part, int count,
- bool compressed, size_t size,
- struct pstore_info *psi)
-{
- return psi->write_buf(type, reason, id, part, psinfo->buf, compressed,
- size, psi);
-}
-
-static int pstore_write_buf_user_compat(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
- const char __user *buf,
- bool compressed, size_t size,
- struct pstore_info *psi)
-{
- unsigned long flags = 0;
- size_t i, bufsize = size;
- long ret = 0;
-
- if (unlikely(!access_ok(VERIFY_READ, buf, size)))
- return -EFAULT;
- if (bufsize > psinfo->bufsize)
- bufsize = psinfo->bufsize;
- spin_lock_irqsave(&psinfo->buf_lock, flags);
- for (i = 0; i < size; ) {
- size_t c = min(size - i, bufsize);
-
- ret = __copy_from_user(psinfo->buf, buf + i, c);
- if (unlikely(ret != 0)) {
- ret = -EFAULT;
- break;
- }
- ret = psi->write_buf(type, reason, id, part, psinfo->buf,
- compressed, c, psi);
- if (unlikely(ret < 0))
- break;
- i += c;
+static int pstore_write_user_compat(struct pstore_record *record,
+ const char __user *buf)
+{
+ int ret = 0;
+
+ if (record->buf)
+ return -EINVAL;
+
+ record->buf = kmalloc(record->size, GFP_KERNEL);
+ if (!record->buf)
+ return -ENOMEM;
+
+ if (unlikely(copy_from_user(record->buf, buf, record->size))) {
+ ret = -EFAULT;
+ goto out;
}
- spin_unlock_irqrestore(&psinfo->buf_lock, flags);
- return unlikely(ret < 0) ? ret : size;
+
+ ret = record->psi->write(record);
+
+out:
+ kfree(record->buf);
+ record->buf = NULL;
+
+ return unlikely(ret < 0) ? ret : record->size;
}
/*
@@ -673,19 +669,35 @@ int pstore_register(struct pstore_info *psi)
{
struct module *owner = psi->owner;
- if (backend && strcmp(backend, psi->name))
+ if (backend && strcmp(backend, psi->name)) {
+ pr_warn("ignoring unexpected backend '%s'\n", psi->name);
return -EPERM;
+ }
+
+ /* Sanity check flags. */
+ if (!psi->flags) {
+ pr_warn("backend '%s' must support at least one frontend\n",
+ psi->name);
+ return -EINVAL;
+ }
+
+ /* Check for required functions. */
+ if (!psi->read || !psi->write) {
+ pr_warn("backend '%s' must implement read() and write()\n",
+ psi->name);
+ return -EINVAL;
+ }
spin_lock(&pstore_lock);
if (psinfo) {
+ pr_warn("backend '%s' already loaded: ignoring '%s'\n",
+ psinfo->name, psi->name);
spin_unlock(&pstore_lock);
return -EBUSY;
}
- if (!psi->write)
- psi->write = pstore_write_compat;
- if (!psi->write_buf_user)
- psi->write_buf_user = pstore_write_buf_user_compat;
+ if (!psi->write_user)
+ psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
spin_unlock(&pstore_lock);
@@ -709,6 +721,7 @@ int pstore_register(struct pstore_info *psi)
if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_register_pmsg();
+ /* Start watching for new records, if desired. */
if (pstore_update_ms >= 0) {
pstore_timer.expires = jiffies +
msecs_to_jiffies(pstore_update_ms);
@@ -721,16 +734,21 @@ int pstore_register(struct pstore_info *psi)
*/
backend = psi->name;
- module_put(owner);
-
pr_info("Registered %s as persistent store backend\n", psi->name);
+ module_put(owner);
+
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
void pstore_unregister(struct pstore_info *psi)
{
+ /* Stop timer and make sure all work has finished. */
+ pstore_update_ms = -1;
+ del_timer_sync(&pstore_timer);
+ flush_work(&pstore_work);
+
if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_unregister_pmsg();
if (psi->flags & PSTORE_FLAGS_FTRACE)
@@ -747,66 +765,99 @@ void pstore_unregister(struct pstore_info *psi)
}
EXPORT_SYMBOL_GPL(pstore_unregister);
+static void decompress_record(struct pstore_record *record)
+{
+ int unzipped_len;
+ char *decompressed;
+
+ /* Only PSTORE_TYPE_DMESG support compression. */
+ if (!record->compressed || record->type != PSTORE_TYPE_DMESG) {
+ pr_warn("ignored compressed record type %d\n", record->type);
+ return;
+ }
+
+ /* No compression method has created the common buffer. */
+ if (!big_oops_buf) {
+ pr_warn("no decompression buffer allocated\n");
+ return;
+ }
+
+ unzipped_len = pstore_decompress(record->buf, big_oops_buf,
+ record->size, big_oops_buf_sz);
+ if (unzipped_len <= 0) {
+ pr_err("decompression failed: %d\n", unzipped_len);
+ return;
+ }
+
+ /* Build new buffer for decompressed contents. */
+ decompressed = kmalloc(unzipped_len + record->ecc_notice_size,
+ GFP_KERNEL);
+ if (!decompressed) {
+ pr_err("decompression ran out of memory\n");
+ return;
+ }
+ memcpy(decompressed, big_oops_buf, unzipped_len);
+
+ /* Append ECC notice to decompressed buffer. */
+ memcpy(decompressed + unzipped_len, record->buf + record->size,
+ record->ecc_notice_size);
+
+ /* Swap out compresed contents with decompressed contents. */
+ kfree(record->buf);
+ record->buf = decompressed;
+ record->size = unzipped_len;
+ record->compressed = false;
+}
+
/*
- * Read all the records from the persistent store. Create
+ * Read all the records from one persistent store backend. Create
* files in our filesystem. Don't warn about -EEXIST errors
* when we are re-scanning the backing store looking to add new
* error records.
*/
-void pstore_get_records(int quiet)
-{
- struct pstore_info *psi = psinfo;
- char *buf = NULL;
- ssize_t size;
- u64 id;
- int count;
- enum pstore_type_id type;
- struct timespec time;
- int failed = 0, rc;
- bool compressed;
- int unzipped_len = -1;
- ssize_t ecc_notice_size = 0;
-
- if (!psi)
+void pstore_get_backend_records(struct pstore_info *psi,
+ struct dentry *root, int quiet)
+{
+ int failed = 0;
+
+ if (!psi || !root)
return;
mutex_lock(&psi->read_mutex);
if (psi->open && psi->open(psi))
goto out;
- while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
- &ecc_notice_size, psi)) > 0) {
- if (compressed && (type == PSTORE_TYPE_DMESG)) {
- if (big_oops_buf)
- unzipped_len = pstore_decompress(buf,
- big_oops_buf, size,
- big_oops_buf_sz);
-
- if (unzipped_len > 0) {
- if (ecc_notice_size)
- memcpy(big_oops_buf + unzipped_len,
- buf + size, ecc_notice_size);
- kfree(buf);
- buf = big_oops_buf;
- size = unzipped_len;
- compressed = false;
- } else {
- pr_err("decompression failed;returned %d\n",
- unzipped_len);
- compressed = true;
- }
+ /*
+ * Backend callback read() allocates record.buf. decompress_record()
+ * may reallocate record.buf. On success, pstore_mkfile() will keep
+ * the record.buf, so free it only on failure.
+ */
+ for (;;) {
+ struct pstore_record *record;
+ int rc;
+
+ record = kzalloc(sizeof(*record), GFP_KERNEL);
+ if (!record) {
+ pr_err("out of memory creating record\n");
+ break;
+ }
+ record->psi = psi;
+
+ record->size = psi->read(record);
+
+ /* No more records left in backend? */
+ if (record->size <= 0)
+ break;
+
+ decompress_record(record);
+ rc = pstore_mkfile(root, record);
+ if (rc) {
+ /* pstore_mkfile() did not take record, so free it. */
+ kfree(record->buf);
+ kfree(record);
+ if (rc != -EEXIST || !quiet)
+ failed++;
}
- rc = pstore_mkfile(type, psi->name, id, count, buf,
- compressed, size + ecc_notice_size,
- time, psi);
- if (unzipped_len < 0) {
- /* Free buffer other than big oops */
- kfree(buf);
- buf = NULL;
- } else
- unzipped_len = -1;
- if (rc && (rc != -EEXIST || !quiet))
- failed++;
}
if (psi->close)
psi->close(psi);
@@ -830,7 +881,9 @@ static void pstore_timefunc(unsigned long dummy)
schedule_work(&pstore_work);
}
- mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
+ if (pstore_update_ms >= 0)
+ mod_timer(&pstore_timer,
+ jiffies + msecs_to_jiffies(pstore_update_ms));
}
module_param(backend, charp, 0444);
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 78f6176c020f..209755e0d7c8 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -15,7 +15,6 @@
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include "internal.h"
static DEFINE_MUTEX(pmsg_lock);
@@ -23,19 +22,22 @@ static DEFINE_MUTEX(pmsg_lock);
static ssize_t write_pmsg(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- u64 id;
+ struct pstore_record record = {
+ .type = PSTORE_TYPE_PMSG,
+ .size = count,
+ .psi = psinfo,
+ };
int ret;
if (!count)
return 0;
- /* check outside lock, page in any data. write_buf_user also checks */
+ /* check outside lock, page in any data. write_user also checks */
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
mutex_lock(&pmsg_lock);
- ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count,
- psinfo);
+ ret = psinfo->write_user(&record, buf);
mutex_unlock(&pmsg_lock);
return ret ? ret : count;
}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 11f918d34b1e..5523df7f17ef 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -235,35 +235,34 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest,
return 0;
}
-static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
- int *count, struct timespec *time,
- char **buf, bool *compressed,
- ssize_t *ecc_notice_size,
- struct pstore_info *psi)
+static ssize_t ramoops_pstore_read(struct pstore_record *record)
{
ssize_t size = 0;
- struct ramoops_context *cxt = psi->data;
+ struct ramoops_context *cxt = record->psi->data;
struct persistent_ram_zone *prz = NULL;
int header_length = 0;
bool free_prz = false;
- /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
+ /*
+ * Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
* PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
* valid time stamps, so it is initialized to zero.
*/
- time->tv_sec = 0;
- time->tv_nsec = 0;
- *compressed = false;
+ record->time.tv_sec = 0;
+ record->time.tv_nsec = 0;
+ record->compressed = false;
/* Find the next valid persistent_ram_zone for DMESG */
while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
- cxt->max_dump_cnt, id, type,
+ cxt->max_dump_cnt, &record->id,
+ &record->type,
PSTORE_TYPE_DMESG, 1);
if (!prz_ok(prz))
continue;
header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
- time, compressed);
+ &record->time,
+ &record->compressed);
/* Clear and skip this DMESG record if it has no valid header */
if (!header_length) {
persistent_ram_free_old(prz);
@@ -274,18 +273,20 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
- 1, id, type, PSTORE_TYPE_CONSOLE, 0);
+ 1, &record->id, &record->type,
+ PSTORE_TYPE_CONSOLE, 0);
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
- 1, id, type, PSTORE_TYPE_PMSG, 0);
+ 1, &record->id, &record->type,
+ PSTORE_TYPE_PMSG, 0);
/* ftrace is last since it may want to dynamically allocate memory. */
if (!prz_ok(prz)) {
if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
prz = ramoops_get_next_prz(cxt->fprzs,
- &cxt->ftrace_read_cnt, 1, id, type,
- PSTORE_TYPE_FTRACE, 0);
+ &cxt->ftrace_read_cnt, 1, &record->id,
+ &record->type, PSTORE_TYPE_FTRACE, 0);
} else {
/*
* Build a new dummy record which combines all the
@@ -302,8 +303,10 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
prz_next = ramoops_get_next_prz(cxt->fprzs,
&cxt->ftrace_read_cnt,
- cxt->max_ftrace_cnt, id,
- type, PSTORE_TYPE_FTRACE, 0);
+ cxt->max_ftrace_cnt,
+ &record->id,
+ &record->type,
+ PSTORE_TYPE_FTRACE, 0);
if (!prz_ok(prz_next))
continue;
@@ -316,7 +319,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (size)
goto out;
}
- *id = 0;
+ record->id = 0;
prz = tmp_prz;
}
}
@@ -329,17 +332,19 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
size = persistent_ram_old_size(prz) - header_length;
/* ECC correction notice */
- *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
+ record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
- *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL);
- if (*buf == NULL) {
+ record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL);
+ if (record->buf == NULL) {
size = -ENOMEM;
goto out;
}
- memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
+ memcpy(record->buf, (char *)persistent_ram_old(prz) + header_length,
+ size);
- persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1);
+ persistent_ram_ecc_string(prz, record->buf + size,
+ record->ecc_notice_size + 1);
out:
if (free_prz) {
@@ -373,23 +378,18 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
return len;
}
-static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
- const char *buf,
- bool compressed, size_t size,
- struct pstore_info *psi)
+static int notrace ramoops_pstore_write(struct pstore_record *record)
{
- struct ramoops_context *cxt = psi->data;
+ struct ramoops_context *cxt = record->psi->data;
struct persistent_ram_zone *prz;
- size_t hlen;
+ size_t size, hlen;
- if (type == PSTORE_TYPE_CONSOLE) {
+ if (record->type == PSTORE_TYPE_CONSOLE) {
if (!cxt->cprz)
return -ENOMEM;
- persistent_ram_write(cxt->cprz, buf, size);
+ persistent_ram_write(cxt->cprz, record->buf, record->size);
return 0;
- } else if (type == PSTORE_TYPE_FTRACE) {
+ } else if (record->type == PSTORE_TYPE_FTRACE) {
int zonenum;
if (!cxt->fprzs)
@@ -402,33 +402,36 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
else
zonenum = 0;
- persistent_ram_write(cxt->fprzs[zonenum], buf, size);
+ persistent_ram_write(cxt->fprzs[zonenum], record->buf,
+ record->size);
return 0;
- } else if (type == PSTORE_TYPE_PMSG) {
+ } else if (record->type == PSTORE_TYPE_PMSG) {
pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__);
return -EINVAL;
}
- if (type != PSTORE_TYPE_DMESG)
+ if (record->type != PSTORE_TYPE_DMESG)
return -EINVAL;
- /* Out of the various dmesg dump types, ramoops is currently designed
+ /*
+ * Out of the various dmesg dump types, ramoops is currently designed
* to only store crash logs, rather than storing general kernel logs.
*/
- if (reason != KMSG_DUMP_OOPS &&
- reason != KMSG_DUMP_PANIC)
+ if (record->reason != KMSG_DUMP_OOPS &&
+ record->reason != KMSG_DUMP_PANIC)
return -EINVAL;
/* Skip Oopes when configured to do so. */
- if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
+ if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
return -EINVAL;
- /* Explicitly only take the first part of any new crash.
+ /*
+ * Explicitly only take the first part of any new crash.
* If our buffer is larger than kmsg_bytes, this can never happen,
* and if our buffer is smaller than kmsg_bytes, we don't want the
* report split across multiple records.
*/
- if (part != 1)
+ if (record->part != 1)
return -ENOSPC;
if (!cxt->dprzs)
@@ -436,53 +439,50 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
prz = cxt->dprzs[cxt->dump_write_cnt];
- hlen = ramoops_write_kmsg_hdr(prz, compressed);
+ /* Build header and append record contents. */
+ hlen = ramoops_write_kmsg_hdr(prz, record->compressed);
+ size = record->size;
if (size + hlen > prz->buffer_size)
size = prz->buffer_size - hlen;
- persistent_ram_write(prz, buf, size);
+ persistent_ram_write(prz, record->buf, size);
cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt;
return 0;
}
-static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type,
- enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
- const char __user *buf,
- bool compressed, size_t size,
- struct pstore_info *psi)
+static int notrace ramoops_pstore_write_user(struct pstore_record *record,
+ const char __user *buf)
{
- if (type == PSTORE_TYPE_PMSG) {
- struct ramoops_context *cxt = psi->data;
+ if (record->type == PSTORE_TYPE_PMSG) {
+ struct ramoops_context *cxt = record->psi->data;
if (!cxt->mprz)
return -ENOMEM;
- return persistent_ram_write_user(cxt->mprz, buf, size);
+ return persistent_ram_write_user(cxt->mprz, buf, record->size);
}
return -EINVAL;
}
-static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
- struct timespec time, struct pstore_info *psi)
+static int ramoops_pstore_erase(struct pstore_record *record)
{
- struct ramoops_context *cxt = psi->data;
+ struct ramoops_context *cxt = record->psi->data;
struct persistent_ram_zone *prz;
- switch (type) {
+ switch (record->type) {
case PSTORE_TYPE_DMESG:
- if (id >= cxt->max_dump_cnt)
+ if (record->id >= cxt->max_dump_cnt)
return -EINVAL;
- prz = cxt->dprzs[id];
+ prz = cxt->dprzs[record->id];
break;
case PSTORE_TYPE_CONSOLE:
prz = cxt->cprz;
break;
case PSTORE_TYPE_FTRACE:
- if (id >= cxt->max_ftrace_cnt)
+ if (record->id >= cxt->max_ftrace_cnt)
return -EINVAL;
- prz = cxt->fprzs[id];
+ prz = cxt->fprzs[record->id];
break;
case PSTORE_TYPE_PMSG:
prz = cxt->mprz;
@@ -503,8 +503,8 @@ static struct ramoops_context oops_cxt = {
.name = "ramoops",
.open = ramoops_pstore_open,
.read = ramoops_pstore_read,
- .write_buf = ramoops_pstore_write_buf,
- .write_buf_user = ramoops_pstore_write_buf_user,
+ .write = ramoops_pstore_write,
+ .write_user = ramoops_pstore_write_user,
.erase = ramoops_pstore_erase,
},
};
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index bc927e30bdcc..e11672aa4575 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -532,7 +532,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
}
/* Initialize general buffer state. */
- prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
+ raw_spin_lock_init(&prz->buffer_lock);
prz->flags = flags;
ret = persistent_ram_buffer_map(start, size, prz, memtype);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 74b489e3714d..ebf80c7739e1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2188,8 +2188,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
/* This can happen when suspending quotas on remount-ro... */
if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) {
inode_lock(toputinode[cnt]);
- toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
- S_NOATIME | S_NOQUOTA);
+ toputinode[cnt]->i_flags &= ~S_NOQUOTA;
truncate_inode_pages(&toputinode[cnt]->i_data, 0);
inode_unlock(toputinode[cnt]);
mark_inode_dirty_sync(toputinode[cnt]);
@@ -2237,7 +2236,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
int error;
- int oldflags = -1;
if (!fmt)
return -ESRCH;
@@ -2285,9 +2283,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
* possible) Also nobody should write to the file - we use
* special IO operations which ignore the immutable bit. */
inode_lock(inode);
- oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
- S_NOQUOTA);
- inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+ inode->i_flags |= S_NOQUOTA;
inode_unlock(inode);
/*
* When S_NOQUOTA is set, remove dquot references as no more
@@ -2329,14 +2325,9 @@ out_file_init:
dqopt->files[type] = NULL;
iput(inode);
out_file_flags:
- if (oldflags != -1) {
- inode_lock(inode);
- /* Set the flags back (in the case of accidental quotaon()
- * on a wrong file we don't want to mess up the flags) */
- inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
- inode->i_flags |= oldflags;
- inode_unlock(inode);
- }
+ inode_lock(inode);
+ inode->i_flags &= ~S_NOQUOTA;
+ inode_unlock(inode);
out_fmt:
put_quota_format(fmt);
@@ -2780,18 +2771,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
}
EXPORT_SYMBOL(dquot_set_dqinfo);
-const struct quotactl_ops dquot_quotactl_ops = {
- .quota_on = dquot_quota_on,
- .quota_off = dquot_quota_off,
- .quota_sync = dquot_quota_sync,
- .get_state = dquot_get_state,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .get_nextdqblk = dquot_get_next_dqblk,
- .set_dqblk = dquot_set_dqblk
-};
-EXPORT_SYMBOL(dquot_quotactl_ops);
-
const struct quotactl_ops dquot_quotactl_sysfile_ops = {
.quota_enable = dquot_quota_enable,
.quota_disable = dquot_quota_disable,
diff --git a/fs/read_write.c b/fs/read_write.c
index c4f88afbc67f..47c1d4484df9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -841,6 +841,81 @@ out:
return ret;
}
+#ifdef CONFIG_COMPAT
+ssize_t compat_rw_copy_check_uvector(int type,
+ const struct compat_iovec __user *uvector, unsigned long nr_segs,
+ unsigned long fast_segs, struct iovec *fast_pointer,
+ struct iovec **ret_pointer)
+{
+ compat_ssize_t tot_len;
+ struct iovec *iov = *ret_pointer = fast_pointer;
+ ssize_t ret = 0;
+ int seg;
+
+ /*
+ * SuS says "The readv() function *may* fail if the iovcnt argument
+ * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
+ * traditionally returned zero for zero segments, so...
+ */
+ if (nr_segs == 0)
+ goto out;
+
+ ret = -EINVAL;
+ if (nr_segs > UIO_MAXIOV)
+ goto out;
+ if (nr_segs > fast_segs) {
+ ret = -ENOMEM;
+ iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (iov == NULL)
+ goto out;
+ }
+ *ret_pointer = iov;
+
+ ret = -EFAULT;
+ if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+ goto out;
+
+ /*
+ * Single unix specification:
+ * We should -EINVAL if an element length is not >= 0 and fitting an
+ * ssize_t.
+ *
+ * In Linux, the total length is limited to MAX_RW_COUNT, there is
+ * no overflow possibility.
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0; seg < nr_segs; seg++) {
+ compat_uptr_t buf;
+ compat_ssize_t len;
+
+ if (__get_user(len, &uvector->iov_len) ||
+ __get_user(buf, &uvector->iov_base)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (len < 0) /* size_t not fitting in compat_ssize_t .. */
+ goto out;
+ if (type >= 0 &&
+ !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (len > MAX_RW_COUNT - tot_len)
+ len = MAX_RW_COUNT - tot_len;
+ tot_len += len;
+ iov->iov_base = compat_ptr(buf);
+ iov->iov_len = (compat_size_t) len;
+ uvector++;
+ iov++;
+ }
+ ret = tot_len;
+
+out:
+ return ret;
+}
+#endif
+
static ssize_t __do_readv_writev(int type, struct file *file,
struct iov_iter *iter, loff_t *pos, int flags)
{
diff --git a/fs/readdir.c b/fs/readdir.c
index 0e8a7f355f7a..89659549c09d 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -18,6 +18,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
@@ -324,3 +325,167 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
fdput_pos(f);
return error;
}
+
+#ifdef CONFIG_COMPAT
+struct compat_old_linux_dirent {
+ compat_ulong_t d_ino;
+ compat_ulong_t d_offset;
+ unsigned short d_namlen;
+ char d_name[1];
+};
+
+struct compat_readdir_callback {
+ struct dir_context ctx;
+ struct compat_old_linux_dirent __user *dirent;
+ int result;
+};
+
+static int compat_fillonedir(struct dir_context *ctx, const char *name,
+ int namlen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct compat_readdir_callback *buf =
+ container_of(ctx, struct compat_readdir_callback, ctx);
+ struct compat_old_linux_dirent __user *dirent;
+ compat_ulong_t d_ino;
+
+ if (buf->result)
+ return -EINVAL;
+ d_ino = ino;
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->result = -EOVERFLOW;
+ return -EOVERFLOW;
+ }
+ buf->result++;
+ dirent = buf->dirent;
+ if (!access_ok(VERIFY_WRITE, dirent,
+ (unsigned long)(dirent->d_name + namlen + 1) -
+ (unsigned long)dirent))
+ goto efault;
+ if ( __put_user(d_ino, &dirent->d_ino) ||
+ __put_user(offset, &dirent->d_offset) ||
+ __put_user(namlen, &dirent->d_namlen) ||
+ __copy_to_user(dirent->d_name, name, namlen) ||
+ __put_user(0, dirent->d_name + namlen))
+ goto efault;
+ return 0;
+efault:
+ buf->result = -EFAULT;
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+ struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
+{
+ int error;
+ struct fd f = fdget_pos(fd);
+ struct compat_readdir_callback buf = {
+ .ctx.actor = compat_fillonedir,
+ .dirent = dirent
+ };
+
+ if (!f.file)
+ return -EBADF;
+
+ error = iterate_dir(f.file, &buf.ctx);
+ if (buf.result)
+ error = buf.result;
+
+ fdput_pos(f);
+ return error;
+}
+
+struct compat_linux_dirent {
+ compat_ulong_t d_ino;
+ compat_ulong_t d_off;
+ unsigned short d_reclen;
+ char d_name[1];
+};
+
+struct compat_getdents_callback {
+ struct dir_context ctx;
+ struct compat_linux_dirent __user *current_dir;
+ struct compat_linux_dirent __user *previous;
+ int count;
+ int error;
+};
+
+static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct compat_linux_dirent __user * dirent;
+ struct compat_getdents_callback *buf =
+ container_of(ctx, struct compat_getdents_callback, ctx);
+ compat_ulong_t d_ino;
+ int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
+ namlen + 2, sizeof(compat_long_t));
+
+ buf->error = -EINVAL; /* only used if we fail.. */
+ if (reclen > buf->count)
+ return -EINVAL;
+ d_ino = ino;
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->error = -EOVERFLOW;
+ return -EOVERFLOW;
+ }
+ dirent = buf->previous;
+ if (dirent) {
+ if (signal_pending(current))
+ return -EINTR;
+ if (__put_user(offset, &dirent->d_off))
+ goto efault;
+ }
+ dirent = buf->current_dir;
+ if (__put_user(d_ino, &dirent->d_ino))
+ goto efault;
+ if (__put_user(reclen, &dirent->d_reclen))
+ goto efault;
+ if (copy_to_user(dirent->d_name, name, namlen))
+ goto efault;
+ if (__put_user(0, dirent->d_name + namlen))
+ goto efault;
+ if (__put_user(d_type, (char __user *) dirent + reclen - 1))
+ goto efault;
+ buf->previous = dirent;
+ dirent = (void __user *)dirent + reclen;
+ buf->current_dir = dirent;
+ buf->count -= reclen;
+ return 0;
+efault:
+ buf->error = -EFAULT;
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+ struct compat_linux_dirent __user *, dirent, unsigned int, count)
+{
+ struct fd f;
+ struct compat_linux_dirent __user * lastdirent;
+ struct compat_getdents_callback buf = {
+ .ctx.actor = compat_filldir,
+ .current_dir = dirent,
+ .count = count
+ };
+ int error;
+
+ if (!access_ok(VERIFY_WRITE, dirent, count))
+ return -EFAULT;
+
+ f = fdget_pos(fd);
+ if (!f.file)
+ return -EBADF;
+
+ error = iterate_dir(f.file, &buf.ctx);
+ if (error >= 0)
+ error = buf.error;
+ lastdirent = buf.previous;
+ if (lastdirent) {
+ if (put_user(buf.ctx.pos, &lastdirent->d_off))
+ error = -EFAULT;
+ else
+ error = count - buf.count;
+ }
+ fdput_pos(f);
+ return error;
+}
+#endif
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a6ab9d64ea1b..873fc04e9403 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1375,7 +1375,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
static void inode2sd(void *sd, struct inode *inode, loff_t size)
{
struct stat_data *sd_v2 = (struct stat_data *)sd;
- __u16 flags;
set_sd_v2_mode(sd_v2, inode->i_mode);
set_sd_v2_nlink(sd_v2, inode->i_nlink);
@@ -1390,9 +1389,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
else
set_sd_v2_generation(sd_v2, inode->i_generation);
- flags = REISERFS_I(inode)->i_attrs;
- i_attrs_to_sd_attrs(inode, &flags);
- set_sd_v2_attrs(sd_v2, flags);
+ set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
}
/* used to copy inode's fields to old stat data */
@@ -2002,10 +1999,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
/* uid and gid must already be set by the caller for quota init */
- /* symlink cannot be immutable or append only, right? */
- if (S_ISLNK(inode->i_mode))
- inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
-
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
@@ -3095,28 +3088,6 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
}
}
-void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
-{
- if (reiserfs_attrs(inode->i_sb)) {
- if (inode->i_flags & S_IMMUTABLE)
- *sd_attrs |= REISERFS_IMMUTABLE_FL;
- else
- *sd_attrs &= ~REISERFS_IMMUTABLE_FL;
- if (inode->i_flags & S_SYNC)
- *sd_attrs |= REISERFS_SYNC_FL;
- else
- *sd_attrs &= ~REISERFS_SYNC_FL;
- if (inode->i_flags & S_NOATIME)
- *sd_attrs |= REISERFS_NOATIME_FL;
- else
- *sd_attrs &= ~REISERFS_NOATIME_FL;
- if (REISERFS_I(inode)->i_flags & i_nopack_mask)
- *sd_attrs |= REISERFS_NOTAIL_FL;
- else
- *sd_attrs &= ~REISERFS_NOTAIL_FL;
- }
-}
-
/*
* decide if this buffer needs to stay around for data logging or ordered
* write purposes
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 1f4692a505a0..acbbaf7a0bb2 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -47,7 +47,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
flags = REISERFS_I(inode)->i_attrs;
- i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
err = put_user(flags, (int __user *)arg);
break;
case REISERFS_IOC_SETFLAGS:{
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index aa40c242f1db..da01f497180a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1961,7 +1961,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
* will be requeued because superblock is being shutdown and doesn't
* have MS_ACTIVE set.
*/
- cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
+ reiserfs_cancel_old_flush(sb);
/* wait for all commits to finish */
cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 249594a821e0..f5cebd70d903 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -475,7 +475,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
* 'cpy_bytes'; create new item header;
* n_ih = new item_header;
*/
- memcpy(&n_ih, ih, SHORT_KEY_SIZE);
+ memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
/* Endian safe, both le */
n_ih.ih_version = ih->ih_version;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 2adcde137c3f..1d34377fef97 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1326,7 +1326,6 @@ struct cpu_key {
#define KEY_NOT_FOUND 0
#define KEY_SIZE (sizeof(struct reiserfs_key))
-#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32))
/* return values for search_by_key and clones */
#define ITEM_FOUND 1
@@ -2949,6 +2948,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *s,
struct reiserfs_list_bitmap *, unsigned int);
void reiserfs_schedule_old_flush(struct super_block *s);
+void reiserfs_cancel_old_flush(struct super_block *s);
void add_save_link(struct reiserfs_transaction_handle *th,
struct inode *inode, int truncate);
int remove_save_link(struct inode *inode, int truncate);
@@ -3099,7 +3099,6 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
}
void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index feabcde0290d..685f1e056998 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -89,11 +89,27 @@ static void flush_old_commits(struct work_struct *work)
sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
s = sbi->s_journal->j_work_sb;
+ /*
+ * We need s_umount for protecting quota writeback. We have to use
+ * trylock as reiserfs_cancel_old_flush() may be waiting for this work
+ * to complete with s_umount held.
+ */
+ if (!down_read_trylock(&s->s_umount)) {
+ /* Requeue work if we are not cancelling it */
+ spin_lock(&sbi->old_work_lock);
+ if (sbi->work_queued == 1)
+ queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
+ spin_unlock(&sbi->old_work_lock);
+ return;
+ }
spin_lock(&sbi->old_work_lock);
- sbi->work_queued = 0;
+ /* Avoid clobbering the cancel state... */
+ if (sbi->work_queued == 1)
+ sbi->work_queued = 0;
spin_unlock(&sbi->old_work_lock);
reiserfs_sync_fs(s, 1);
+ up_read(&s->s_umount);
}
void reiserfs_schedule_old_flush(struct super_block *s)
@@ -117,21 +133,22 @@ void reiserfs_schedule_old_flush(struct super_block *s)
spin_unlock(&sbi->old_work_lock);
}
-static void cancel_old_flush(struct super_block *s)
+void reiserfs_cancel_old_flush(struct super_block *s)
{
struct reiserfs_sb_info *sbi = REISERFS_SB(s);
- cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
spin_lock(&sbi->old_work_lock);
- sbi->work_queued = 0;
+ /* Make sure no new flushes will be queued */
+ sbi->work_queued = 2;
spin_unlock(&sbi->old_work_lock);
+ cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
}
static int reiserfs_freeze(struct super_block *s)
{
struct reiserfs_transaction_handle th;
- cancel_old_flush(s);
+ reiserfs_cancel_old_flush(s);
reiserfs_write_lock(s);
if (!(s->s_flags & MS_RDONLY)) {
@@ -152,7 +169,13 @@ static int reiserfs_freeze(struct super_block *s)
static int reiserfs_unfreeze(struct super_block *s)
{
+ struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
reiserfs_allow_writes(s);
+ spin_lock(&sbi->old_work_lock);
+ /* Allow old_work to run again */
+ sbi->work_queued = 0;
+ spin_unlock(&sbi->old_work_lock);
return 0;
}
@@ -547,12 +570,28 @@ static void reiserfs_kill_sb(struct super_block *s)
kill_block_super(s);
}
+#ifdef CONFIG_QUOTA
+static int reiserfs_quota_off(struct super_block *sb, int type);
+
+static void reiserfs_quota_off_umount(struct super_block *s)
+{
+ int type;
+
+ for (type = 0; type < REISERFS_MAXQUOTAS; type++)
+ reiserfs_quota_off(s, type);
+}
+#else
+static inline void reiserfs_quota_off_umount(struct super_block *s)
+{
+}
+#endif
+
static void reiserfs_put_super(struct super_block *s)
{
struct reiserfs_transaction_handle th;
th.t_trans_id = 0;
- dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+ reiserfs_quota_off_umount(s);
reiserfs_write_lock(s);
@@ -817,7 +856,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
static const struct quotactl_ops reiserfs_qctl_operations = {
.quota_on = reiserfs_quota_on,
- .quota_off = dquot_quota_off,
+ .quota_off = reiserfs_quota_off,
.quota_sync = dquot_quota_sync,
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
@@ -2194,7 +2233,7 @@ error_unlocked:
if (sbi->commit_wq)
destroy_workqueue(sbi->commit_wq);
- cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+ reiserfs_cancel_old_flush(s);
reiserfs_free_bitmap_cache(s);
if (SB_BUFFER_WITH_SB(s))
@@ -2405,12 +2444,47 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
goto out;
}
reiserfs_write_unlock(sb);
- return dquot_quota_on(sb, type, format_id, path);
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (!err) {
+ inode_lock(inode);
+ REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
+ REISERFS_NOATIME_FL;
+ inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
+ S_IMMUTABLE | S_NOATIME);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+ }
+ return err;
out:
reiserfs_write_unlock(sb);
return err;
}
+static int reiserfs_quota_off(struct super_block *sb, int type)
+{
+ int err;
+ struct inode *inode = sb_dqopt(sb)->files[type];
+
+ if (!inode || !igrab(inode))
+ goto out;
+
+ err = dquot_quota_off(sb, type);
+ if (err)
+ goto out_put;
+
+ inode_lock(inode);
+ REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
+ REISERFS_NOATIME_FL);
+ inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
+ inode_unlock(inode);
+ mark_inode_dirty(inode);
+out_put:
+ iput(inode);
+ return err;
+out:
+ return dquot_quota_off(sb, type);
+}
+
/*
* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
diff --git a/fs/select.c b/fs/select.c
index e2112270d75a..bd4b2ccfd346 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -338,6 +338,53 @@ sticky:
return ret;
}
+/*
+ * Scalable version of the fd_set.
+ */
+
+typedef struct {
+ unsigned long *in, *out, *ex;
+ unsigned long *res_in, *res_out, *res_ex;
+} fd_set_bits;
+
+/*
+ * How many longwords for "nr" bits?
+ */
+#define FDS_BITPERLONG (8*sizeof(long))
+#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
+#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long))
+
+/*
+ * We do a VERIFY_WRITE here even though we are only reading this time:
+ * we'll write to it eventually..
+ *
+ * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
+ */
+static inline
+int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
+{
+ nr = FDS_BYTES(nr);
+ if (ufdset)
+ return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;
+
+ memset(fdset, 0, nr);
+ return 0;
+}
+
+static inline unsigned long __must_check
+set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
+{
+ if (ufdset)
+ return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
+ return 0;
+}
+
+static inline
+void zero_fd_set(unsigned long nr, unsigned long *fdset)
+{
+ memset(fdset, 0, FDS_BYTES(nr));
+}
+
#define FDS_IN(fds, n) (fds->in + n)
#define FDS_OUT(fds, n) (fds->out + n)
#define FDS_EX(fds, n) (fds->ex + n)
@@ -401,7 +448,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
@@ -409,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
int retval, i, timed_out = 0;
u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
- unsigned long busy_end = 0;
+ unsigned long busy_start = 0;
rcu_read_lock();
retval = max_select_fd(n, fds);
@@ -512,11 +559,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
- if (!busy_end) {
- busy_end = busy_loop_end_time();
+ if (!busy_start) {
+ busy_start = busy_loop_current_time();
continue;
}
- if (!busy_loop_timeout(busy_end))
+ if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
@@ -800,7 +847,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
int timed_out = 0, count = 0;
u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
- unsigned long busy_end = 0;
+ unsigned long busy_start = 0;
/* Optimise the no-wait case */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -853,11 +900,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
- if (!busy_end) {
- busy_end = busy_loop_end_time();
+ if (!busy_start) {
+ busy_start = busy_loop_current_time();
continue;
}
- if (!busy_loop_timeout(busy_end))
+ if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
@@ -881,7 +928,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \
sizeof(struct pollfd))
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
@@ -1053,3 +1100,373 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return ret;
}
+
+#ifdef CONFIG_COMPAT
+#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
+
+static
+int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+ int timeval, int ret)
+{
+ struct timespec ts;
+
+ if (!p)
+ return ret;
+
+ if (current->personality & STICKY_TIMEOUTS)
+ goto sticky;
+
+ /* No update for zero timeout */
+ if (!end_time->tv_sec && !end_time->tv_nsec)
+ return ret;
+
+ ktime_get_ts(&ts);
+ ts = timespec_sub(*end_time, ts);
+ if (ts.tv_sec < 0)
+ ts.tv_sec = ts.tv_nsec = 0;
+
+ if (timeval) {
+ struct compat_timeval rtv;
+
+ rtv.tv_sec = ts.tv_sec;
+ rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+ if (!copy_to_user(p, &rtv, sizeof(rtv)))
+ return ret;
+ } else {
+ struct compat_timespec rts;
+
+ rts.tv_sec = ts.tv_sec;
+ rts.tv_nsec = ts.tv_nsec;
+
+ if (!copy_to_user(p, &rts, sizeof(rts)))
+ return ret;
+ }
+ /*
+ * If an application puts its timeval in read-only memory, we
+ * don't want the Linux-specific update to the timeval to
+ * cause a fault after the select has completed
+ * successfully. However, because we're not updating the
+ * timeval, we can't restart the system call.
+ */
+
+sticky:
+ if (ret == -ERESTARTNOHAND)
+ ret = -EINTR;
+ return ret;
+}
+
+/*
+ * Ooo, nasty. We need here to frob 32-bit unsigned longs to
+ * 64-bit unsigned longs.
+ */
+static
+int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+ unsigned long *fdset)
+{
+ nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+ if (ufdset) {
+ unsigned long odd;
+
+ if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
+ return -EFAULT;
+
+ odd = nr & 1UL;
+ nr &= ~1UL;
+ while (nr) {
+ unsigned long h, l;
+ if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
+ return -EFAULT;
+ ufdset += 2;
+ *fdset++ = h << 32 | l;
+ nr -= 2;
+ }
+ if (odd && __get_user(*fdset, ufdset))
+ return -EFAULT;
+ } else {
+ /* Tricky, must clear full unsigned long in the
+ * kernel fdset at the end, this makes sure that
+ * actually happens.
+ */
+ memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
+ }
+ return 0;
+}
+
+static
+int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+ unsigned long *fdset)
+{
+ unsigned long odd;
+ nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+
+ if (!ufdset)
+ return 0;
+
+ odd = nr & 1UL;
+ nr &= ~1UL;
+ while (nr) {
+ unsigned long h, l;
+ l = *fdset++;
+ h = l >> 32;
+ if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
+ return -EFAULT;
+ ufdset += 2;
+ nr -= 2;
+ }
+ if (odd && __put_user(*fdset, ufdset))
+ return -EFAULT;
+ return 0;
+}
+
+
+/*
+ * This is a virtual copy of sys_select from fs/select.c and probably
+ * should be compared to it from time to time
+ */
+
+/*
+ * We can actually return ERESTARTSYS instead of EINTR, but I'd
+ * like to be certain this leads to no problems. So I return
+ * EINTR just for safety.
+ *
+ * Update: ERESTARTSYS breaks at least the xview clock binary, so
+ * I'm trying ERESTARTNOHAND which restart only when you want to.
+ */
+static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
+ compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+ struct timespec *end_time)
+{
+ fd_set_bits fds;
+ void *bits;
+ int size, max_fds, ret = -EINVAL;
+ struct fdtable *fdt;
+ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
+
+ if (n < 0)
+ goto out_nofds;
+
+ /* max_fds can increase, so grab it once to avoid race */
+ rcu_read_lock();
+ fdt = files_fdtable(current->files);
+ max_fds = fdt->max_fds;
+ rcu_read_unlock();
+ if (n > max_fds)
+ n = max_fds;
+
+ /*
+ * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
+ * since we used fdset we need to allocate memory in units of
+ * long-words.
+ */
+ size = FDS_BYTES(n);
+ bits = stack_fds;
+ if (size > sizeof(stack_fds) / 6) {
+ bits = kmalloc(6 * size, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!bits)
+ goto out_nofds;
+ }
+ fds.in = (unsigned long *) bits;
+ fds.out = (unsigned long *) (bits + size);
+ fds.ex = (unsigned long *) (bits + 2*size);
+ fds.res_in = (unsigned long *) (bits + 3*size);
+ fds.res_out = (unsigned long *) (bits + 4*size);
+ fds.res_ex = (unsigned long *) (bits + 5*size);
+
+ if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
+ (ret = compat_get_fd_set(n, outp, fds.out)) ||
+ (ret = compat_get_fd_set(n, exp, fds.ex)))
+ goto out;
+ zero_fd_set(n, fds.res_in);
+ zero_fd_set(n, fds.res_out);
+ zero_fd_set(n, fds.res_ex);
+
+ ret = do_select(n, &fds, end_time);
+
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = -ERESTARTNOHAND;
+ if (signal_pending(current))
+ goto out;
+ ret = 0;
+ }
+
+ if (compat_set_fd_set(n, inp, fds.res_in) ||
+ compat_set_fd_set(n, outp, fds.res_out) ||
+ compat_set_fd_set(n, exp, fds.res_ex))
+ ret = -EFAULT;
+out:
+ if (bits != stack_fds)
+ kfree(bits);
+out_nofds:
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timeval __user *, tvp)
+{
+ struct timespec end_time, *to = NULL;
+ struct compat_timeval tv;
+ int ret;
+
+ if (tvp) {
+ if (copy_from_user(&tv, tvp, sizeof(tv)))
+ return -EFAULT;
+
+ to = &end_time;
+ if (poll_select_set_timeout(to,
+ tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+ (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
+ return -EINVAL;
+ }
+
+ ret = compat_core_sys_select(n, inp, outp, exp, to);
+ ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);
+
+ return ret;
+}
+
+struct compat_sel_arg_struct {
+ compat_ulong_t n;
+ compat_uptr_t inp;
+ compat_uptr_t outp;
+ compat_uptr_t exp;
+ compat_uptr_t tvp;
+};
+
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
+{
+ struct compat_sel_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+ compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
+static long do_compat_pselect(int n, compat_ulong_t __user *inp,
+ compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+ struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize)
+{
+ compat_sigset_t ss32;
+ sigset_t ksigmask, sigsaved;
+ struct compat_timespec ts;
+ struct timespec end_time, *to = NULL;
+ int ret;
+
+ if (tsp) {
+ if (copy_from_user(&ts, tsp, sizeof(ts)))
+ return -EFAULT;
+
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ if (sigmask) {
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ return -EFAULT;
+ sigset_from_compat(&ksigmask, &ss32);
+
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ ret = compat_core_sys_select(n, inp, outp, exp, to);
+ ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+ if (ret == -ERESTARTNOHAND) {
+ /*
+ * Don't restore the signal mask yet. Let do_signal() deliver
+ * the signal on the way back to userspace, before the signal
+ * mask is restored.
+ */
+ if (sigmask) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_restore_sigmask();
+ }
+ } else if (sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timespec __user *, tsp, void __user *, sig)
+{
+ compat_size_t sigsetsize = 0;
+ compat_uptr_t up = 0;
+
+ if (sig) {
+ if (!access_ok(VERIFY_READ, sig,
+ sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+ __get_user(up, (compat_uptr_t __user *)sig) ||
+ __get_user(sigsetsize,
+ (compat_size_t __user *)(sig+sizeof(up))))
+ return -EFAULT;
+ }
+ return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+ sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+ unsigned int, nfds, struct compat_timespec __user *, tsp,
+ const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
+{
+ compat_sigset_t ss32;
+ sigset_t ksigmask, sigsaved;
+ struct compat_timespec ts;
+ struct timespec end_time, *to = NULL;
+ int ret;
+
+ if (tsp) {
+ if (copy_from_user(&ts, tsp, sizeof(ts)))
+ return -EFAULT;
+
+ to = &end_time;
+ if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+ return -EINVAL;
+ }
+
+ if (sigmask) {
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ return -EFAULT;
+ sigset_from_compat(&ksigmask, &ss32);
+
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ ret = do_sys_poll(ufds, nfds, to);
+
+ /* We can restart this syscall, usually */
+ if (ret == -EINTR) {
+ /*
+ * Don't restore the signal mask yet. Let do_signal() deliver
+ * the signal on the way back to userspace, before the signal
+ * mask is restored.
+ */
+ if (sigmask) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_restore_sigmask();
+ }
+ ret = -ERESTARTNOHAND;
+ } else if (sigmask)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+ return ret;
+}
+#endif
diff --git a/fs/splice.c b/fs/splice.c
index 006ba50f4ece..540c4a44756c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -247,11 +247,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
}
EXPORT_SYMBOL(add_to_pipe);
-void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
-{
- put_page(spd->pages[i]);
-}
-
/*
* Check if we need to grow the arrays holding pages and partial page
* descriptions.
@@ -393,7 +388,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct iov_iter to;
struct page **pages;
unsigned int nr_pages;
- size_t offset, dummy, copied = 0;
+ size_t offset, base, copied = 0;
ssize_t res;
int i;
@@ -408,12 +403,11 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
- res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
+ res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
if (res <= 0)
return -ENOMEM;
- BUG_ON(dummy);
- nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
vec = __vec;
if (nr_pages > PIPE_DEF_BUFFERS) {
@@ -1359,6 +1353,8 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
struct fd f;
long error;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
if (unlikely(nr_segs > UIO_MAXIOV))
return -EINVAL;
else if (unlikely(!nr_segs))
@@ -1409,6 +1405,9 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
if (unlikely(!len))
return 0;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
error = -EBADF;
in = fdget(fd_in);
if (in.file) {
@@ -1737,6 +1736,9 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
struct fd in;
int error;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
if (unlikely(!len))
return 0;
diff --git a/fs/stat.c b/fs/stat.c
index c6c963b2546b..f494b182c7c7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -15,6 +15,7 @@
#include <linux/cred.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -547,13 +548,13 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
/**
* sys_statx - System call to get enhanced stats
* @dfd: Base directory to pathwalk from *or* fd to stat.
- * @filename: File to stat *or* NULL.
+ * @filename: File to stat or "" with AT_EMPTY_PATH
* @flags: AT_* flags to control pathwalk.
* @mask: Parts of statx struct actually required.
* @buffer: Result buffer.
*
- * Note that if filename is NULL, then it does the equivalent of fstat() using
- * dfd to indicate the file of interest.
+ * Note that fstat() can be emulated by setting dfd to the fd of interest,
+ * supplying "" as the filename and setting AT_EMPTY_PATH in the flags.
*/
SYSCALL_DEFINE5(statx,
int, dfd, const char __user *, filename, unsigned, flags,
@@ -568,16 +569,98 @@ SYSCALL_DEFINE5(statx,
if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
- if (filename)
- error = vfs_statx(dfd, filename, flags, &stat, mask);
- else
- error = vfs_statx_fd(dfd, &stat, mask, flags);
+ error = vfs_statx(dfd, filename, flags, &stat, mask);
if (error)
return error;
return cp_statx(&stat, buffer);
}
+#ifdef CONFIG_COMPAT
+static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+{
+ struct compat_stat tmp;
+
+ if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
+
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_ino = stat->ino;
+ if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+ return -EOVERFLOW;
+ tmp.st_mode = stat->mode;
+ tmp.st_nlink = stat->nlink;
+ if (tmp.st_nlink != stat->nlink)
+ return -EOVERFLOW;
+ SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+ tmp.st_rdev = old_encode_dev(stat->rdev);
+ if ((u64) stat->size > MAX_NON_LFS)
+ return -EOVERFLOW;
+ tmp.st_size = stat->size;
+ tmp.st_atime = stat->atime.tv_sec;
+ tmp.st_atime_nsec = stat->atime.tv_nsec;
+ tmp.st_mtime = stat->mtime.tv_sec;
+ tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+ tmp.st_ctime = stat->ctime.tv_sec;
+ tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+ tmp.st_blocks = stat->blocks;
+ tmp.st_blksize = stat->blksize;
+ return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_stat(filename, &stat);
+ if (error)
+ return error;
+ return cp_compat_stat(&stat, statbuf);
+}
+
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_lstat(filename, &stat);
+ if (error)
+ return error;
+ return cp_compat_stat(&stat, statbuf);
+}
+
+#ifndef __ARCH_WANT_STAT64
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+ const char __user *, filename,
+ struct compat_stat __user *, statbuf, int, flag)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_fstatat(dfd, filename, &stat, flag);
+ if (error)
+ return error;
+ return cp_compat_stat(&stat, statbuf);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+ struct compat_stat __user *, statbuf)
+{
+ struct kstat stat;
+ int error = vfs_fstat(fd, &stat);
+
+ if (!error)
+ error = cp_compat_stat(&stat, statbuf);
+ return error;
+}
+#endif
+
/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
void __inode_add_bytes(struct inode *inode, loff_t bytes)
{
diff --git a/fs/statfs.c b/fs/statfs.c
index 13ae259d4879..4e4623c7a126 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -7,6 +7,7 @@
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/uaccess.h>
+#include <linux/compat.h>
#include "internal.h"
static int flags_by_mnt(int mnt_flags)
@@ -239,3 +240,142 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
}
+
+#ifdef CONFIG_COMPAT
+static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
+{
+ if (sizeof ubuf->f_blocks == 4) {
+ if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+ kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+ /* f_files and f_ffree may be -1; it's okay
+ * to stuff that into 32 bits */
+ if (kbuf->f_files != 0xffffffffffffffffULL
+ && (kbuf->f_files & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ if (kbuf->f_ffree != 0xffffffffffffffffULL
+ && (kbuf->f_ffree & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ }
+ if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+ __put_user(kbuf->f_type, &ubuf->f_type) ||
+ __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+ __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+ __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+ __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+ __put_user(kbuf->f_files, &ubuf->f_files) ||
+ __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+ __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+ __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+ __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+ __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * The following statfs calls are copies of code from fs/statfs.c and
+ * should be checked against those from time to time
+ */
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
+{
+ struct kstatfs tmp;
+ int error = user_statfs(pathname, &tmp);
+ if (!error)
+ error = put_compat_statfs(buf, &tmp);
+ return error;
+}
+
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
+{
+ struct kstatfs tmp;
+ int error = fd_statfs(fd, &tmp);
+ if (!error)
+ error = put_compat_statfs(buf, &tmp);
+ return error;
+}
+
+static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
+{
+ if (sizeof(ubuf->f_bsize) == 4) {
+ if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
+ kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+ /* f_files and f_ffree may be -1; it's okay
+ * to stuff that into 32 bits */
+ if (kbuf->f_files != 0xffffffffffffffffULL
+ && (kbuf->f_files & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ if (kbuf->f_ffree != 0xffffffffffffffffULL
+ && (kbuf->f_ffree & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ }
+ if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+ __put_user(kbuf->f_type, &ubuf->f_type) ||
+ __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+ __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+ __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+ __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+ __put_user(kbuf->f_files, &ubuf->f_files) ||
+ __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+ __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+ __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+ __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+ __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+ __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+ __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+ return -EFAULT;
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+ struct kstatfs tmp;
+ int error;
+
+ if (sz != sizeof(*buf))
+ return -EINVAL;
+
+ error = user_statfs(pathname, &tmp);
+ if (!error)
+ error = put_compat_statfs64(buf, &tmp);
+ return error;
+}
+
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+ struct kstatfs tmp;
+ int error;
+
+ if (sz != sizeof(*buf))
+ return -EINVAL;
+
+ error = fd_statfs(fd, &tmp);
+ if (!error)
+ error = put_compat_statfs64(buf, &tmp);
+ return error;
+}
+
+/*
+ * This is a copy of sys_ustat, just dealing with a structure layout.
+ * Given how simple this syscall is that apporach is more maintainable
+ * than the various conversion hacks.
+ */
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
+{
+ struct compat_ustat tmp;
+ struct kstatfs sbuf;
+ int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+ if (err)
+ return err;
+
+ memset(&tmp, 0, sizeof(struct compat_ustat));
+ tmp.f_tfree = sbuf.f_bfree;
+ tmp.f_tinode = sbuf.f_ffree;
+ if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
+ return -EFAULT;
+ return 0;
+}
+#endif
diff --git a/fs/super.c b/fs/super.c
index b8b6a086c03b..adb0c0de428c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb)
hlist_del_init(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
+ if (sb->s_bdi != &noop_backing_dev_info) {
+ bdi_put(sb->s_bdi);
+ sb->s_bdi = &noop_backing_dev_info;
+ }
}
EXPORT_SYMBOL(generic_shutdown_super);
@@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
@@ -1256,6 +1256,49 @@ out:
}
/*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
+{
+ struct backing_dev_info *bdi;
+ int err;
+ va_list args;
+
+ bdi = bdi_alloc(GFP_KERNEL);
+ if (!bdi)
+ return -ENOMEM;
+
+ bdi->name = sb->s_type->name;
+
+ va_start(args, fmt);
+ err = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ if (err) {
+ bdi_put(bdi);
+ return err;
+ }
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi;
+
+ return 0;
+}
+EXPORT_SYMBOL(super_setup_bdi_name);
+
+/*
+ * Setup private BDI for given superblock. I gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi(struct super_block *sb)
+{
+ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+ return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
+ atomic_long_inc_return(&bdi_seq));
+}
+EXPORT_SYMBOL(super_setup_bdi);
+
+/*
* This is an internal function, please use sb_end_{write,pagefault,intwrite}
* instead.
*/
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 1e712a364680..718b749fa11a 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,7 @@
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/random.h>
+#include <linux/ctype.h>
#include "ubifs.h"
static DEFINE_SPINLOCK(dbg_lock);
@@ -286,8 +287,10 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
break;
}
- pr_err("\t%d: %s (%s)\n",
- count++, dent->name, get_dent_type(dent->type));
+ pr_err("\t%d: inode %llu, type %s, len %d\n",
+ count++, (unsigned long long) le64_to_cpu(dent->inum),
+ get_dent_type(dent->type),
+ le16_to_cpu(dent->nlen));
fname_name(&nm) = dent->name;
fname_len(&nm) = le16_to_cpu(dent->nlen);
@@ -464,7 +467,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
pr_err("(bad name length, not printing, bad or corrupted node)");
else {
for (i = 0; i < nlen && dent->name[i]; i++)
- pr_cont("%c", dent->name[i]);
+ pr_cont("%c", isprint(dent->name[i]) ?
+ dent->name[i] : '?');
}
pr_cont("\n");
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 30825d882aa9..b777bddaa1dd 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -606,8 +606,8 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
while (1) {
- dbg_gen("feed '%s', ino %llu, new f_pos %#x",
- dent->name, (unsigned long long)le64_to_cpu(dent->inum),
+ dbg_gen("ino %llu, new f_pos %#x",
+ (unsigned long long)le64_to_cpu(dent->inum),
key_hash_flash(c, &dent->key));
ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
ubifs_inode(dir)->creat_sqnum);
@@ -748,6 +748,11 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
goto out_fname;
lock_2_inodes(dir, inode);
+
+ /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */
+ if (inode->i_nlink == 0)
+ ubifs_delete_orphan(c, inode->i_ino);
+
inc_nlink(inode);
ihold(inode);
inode->i_ctime = ubifs_current_time(inode);
@@ -768,6 +773,8 @@ out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
drop_nlink(inode);
+ if (inode->i_nlink == 0)
+ ubifs_add_orphan(c, inode->i_ino);
unlock_2_inodes(dir, inode);
ubifs_release_budget(c, &req);
iput(inode);
@@ -1068,8 +1075,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
}
err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
- if (err)
+ if (err) {
+ kfree(dev);
goto out_budg;
+ }
sz_change = CALC_DENT_SIZE(fname_len(&nm));
@@ -1316,9 +1325,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned int uninitialized_var(saved_nlink);
struct fscrypt_name old_nm, new_nm;
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
/*
* Budget request settings: deletion direntry, new direntry, removing
* the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b73811bd7676..cf4cc99b75b5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb)
}
ubifs_umount(c);
- bdi_destroy(&c->bdi);
ubi_close_volume(c->ubi);
mutex_unlock(&c->umount_mutex);
}
@@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
+ err = ubifs_parse_options(c, data, 0);
+ if (err)
+ goto out_close;
+
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
* UBIFS, I/O is not deferred, it is done immediately in readpage,
* which means the user would have to wait not just for their own I/O
* but the read-ahead I/O as well i.e. completely pointless.
*
- * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+ * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also
+ * @sb->s_bdi->capabilities are initialized to 0 so there won't be any
+ * writeback happening.
*/
- c->bdi.name = "ubifs",
- c->bdi.capabilities = 0;
- err = bdi_init(&c->bdi);
+ err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num,
+ c->vi.vol_id);
if (err)
goto out_close;
- err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
- c->vi.ubi_num, c->vi.vol_id);
- if (err)
- goto out_bdi;
-
- err = ubifs_parse_options(c, data, 0);
- if (err)
- goto out_bdi;
- sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
@@ -2080,8 +2075,6 @@ out_umount:
ubifs_umount(c);
out_unlock:
mutex_unlock(&c->umount_mutex);
-out_bdi:
- bdi_destroy(&c->bdi);
out_close:
ubi_close_volume(c->ubi);
out:
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4d57e488038e..4da10a6d702a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -972,7 +972,6 @@ struct ubifs_debug_info;
* struct ubifs_info - UBIFS file-system description data structure
* (per-superblock).
* @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable read-ahead
*
* @highest_inum: highest used inode number
* @max_sqnum: current global sequence number
@@ -1220,7 +1219,6 @@ struct ubifs_debug_info;
*/
struct ubifs_info {
struct super_block *vfs_sb;
- struct backing_dev_info bdi;
ino_t highest_inum;
unsigned long long max_sqnum;
@@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations;
extern const struct file_operations ubifs_dir_operations;
extern const struct inode_operations ubifs_dir_inode_operations;
extern const struct inode_operations ubifs_symlink_inode_operations;
-extern struct backing_dev_info ubifs_backing_dev_info;
extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
/* io.c */
diff --git a/fs/udf/file.c b/fs/udf/file.c
index e04cc0cdca9d..f5eb2d5b3bac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -44,12 +44,12 @@ static void __udf_adinicb_readpage(struct page *page)
char *kaddr;
struct udf_inode_info *iinfo = UDF_I(inode);
- kaddr = kmap(page);
+ kaddr = kmap_atomic(page);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size);
flush_dcache_page(page);
SetPageUptodate(page);
- kunmap(page);
+ kunmap_atomic(kaddr);
}
static int udf_adinicb_readpage(struct file *file, struct page *page)
@@ -70,11 +70,11 @@ static int udf_adinicb_writepage(struct page *page,
BUG_ON(!PageLocked(page));
- kaddr = kmap(page);
+ kaddr = kmap_atomic(page);
memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size);
- mark_inode_dirty(inode);
SetPageUptodate(page);
- kunmap(page);
+ kunmap_atomic(kaddr);
+ mark_inode_dirty(inode);
unlock_page(page);
return 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a8d8f71ef8bd..98c510e17203 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -276,14 +276,14 @@ int udf_expand_file_adinicb(struct inode *inode)
return -ENOMEM;
if (!PageUptodate(page)) {
- kaddr = kmap(page);
+ kaddr = kmap_atomic(page);
memset(kaddr + iinfo->i_lenAlloc, 0x00,
PAGE_SIZE - iinfo->i_lenAlloc);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
iinfo->i_lenAlloc);
flush_dcache_page(page);
SetPageUptodate(page);
- kunmap(page);
+ kunmap_atomic(kaddr);
}
down_write(&iinfo->i_data_sem);
memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
@@ -300,11 +300,11 @@ int udf_expand_file_adinicb(struct inode *inode)
if (err) {
/* Restore everything back so that we don't lose data... */
lock_page(page);
- kaddr = kmap(page);
down_write(&iinfo->i_data_sem);
+ kaddr = kmap_atomic(page);
memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
inode->i_size);
- kunmap(page);
+ kunmap_atomic(kaddr);
unlock_page(page);
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
inode->i_data.a_ops = &udf_adinicb_aops;
@@ -1535,7 +1535,7 @@ reread:
inode->i_data.a_ops = &udf_symlink_aops;
inode->i_op = &udf_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_mode = S_IFLNK | 0777;
break;
case ICBTAG_FILE_TYPE_MAIN:
udf_debug("METADATA FILE-----\n");
@@ -1591,9 +1591,9 @@ static umode_t udf_convert_permissions(struct fileEntry *fe)
permissions = le32_to_cpu(fe->permissions);
flags = le16_to_cpu(fe->icbTag.flags);
- mode = ((permissions) & S_IRWXO) |
- ((permissions >> 2) & S_IRWXG) |
- ((permissions >> 4) & S_IRWXU) |
+ mode = ((permissions) & 0007) |
+ ((permissions >> 2) & 0070) |
+ ((permissions >> 4) & 0700) |
((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) |
((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) |
((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0);
@@ -1669,9 +1669,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
else
fe->gid = cpu_to_le32(i_gid_read(inode));
- udfperms = ((inode->i_mode & S_IRWXO)) |
- ((inode->i_mode & S_IRWXG) << 2) |
- ((inode->i_mode & S_IRWXU) << 4);
+ udfperms = ((inode->i_mode & 0007)) |
+ ((inode->i_mode & 0070) << 2) |
+ ((inode->i_mode & 0700) << 4);
udfperms |= (le32_to_cpu(fe->permissions) &
(FE_PERM_O_DELETE | FE_PERM_O_CHATTR |
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index babf48d0e553..385ee89d5824 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -906,7 +906,7 @@ out:
static int udf_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
struct pathComponent *pc;
const char *compstart;
struct extent_position epos = {};
diff --git a/fs/utimes.c b/fs/utimes.c
index 32b15b3f6629..6571d8c848a0 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -1,14 +1,10 @@
-#include <linux/compiler.h>
#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/linkage.h>
#include <linux/mount.h>
#include <linux/namei.h>
-#include <linux/sched.h>
-#include <linux/stat.h>
#include <linux/utime.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#include <linux/compat.h>
#include <asm/unistd.h>
#ifdef __ARCH_WANT_SYS_UTIME
@@ -219,3 +215,63 @@ SYSCALL_DEFINE2(utimes, char __user *, filename,
{
return sys_futimesat(AT_FDCWD, filename, utimes);
}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Not all architectures have sys_utime, so implement this in terms
+ * of sys_utimes.
+ */
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+ struct compat_utimbuf __user *, t)
+{
+ struct timespec tv[2];
+
+ if (t) {
+ if (get_user(tv[0].tv_sec, &t->actime) ||
+ get_user(tv[1].tv_sec, &t->modtime))
+ return -EFAULT;
+ tv[0].tv_nsec = 0;
+ tv[1].tv_nsec = 0;
+ }
+ return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
+{
+ struct timespec tv[2];
+
+ if (t) {
+ if (compat_get_timespec(&tv[0], &t[0]) ||
+ compat_get_timespec(&tv[1], &t[1]))
+ return -EFAULT;
+
+ if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
+ return 0;
+ }
+ return do_utimes(dfd, filename, t ? tv : NULL, flags);
+}
+
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
+{
+ struct timespec tv[2];
+
+ if (t) {
+ if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
+ get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
+ get_user(tv[1].tv_sec, &t[1].tv_sec) ||
+ get_user(tv[1].tv_nsec, &t[1].tv_usec))
+ return -EFAULT;
+ if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
+ tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
+ return -EINVAL;
+ tv[0].tv_nsec *= 1000;
+ tv[1].tv_nsec *= 1000;
+ }
+ return do_utimes(dfd, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
+{
+ return compat_sys_futimesat(AT_FDCWD, filename, t);
+}
+#endif
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 26ef1958b65b..5c90f82b8f6b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -79,6 +79,7 @@ xfs-y += xfs_aops.o \
xfs_extent_busy.o \
xfs_file.o \
xfs_filestream.o \
+ xfs_fsmap.o \
xfs_fsops.o \
xfs_globals.o \
xfs_icache.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 70a5b55e0870..780fc8986dab 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
void *
kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
{
- unsigned noio_flag = 0;
+ unsigned nofs_flag = 0;
void *ptr;
gfp_t lflags;
@@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
* __vmalloc() will allocate data pages and auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
* here. Hence we need to tell memory reclaim that we are in such a
- * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+ * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
* the filesystem here and potentially deadlocking.
*/
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- noio_flag = memalloc_noio_save();
+ if (flags & KM_NOFS)
+ nofs_flag = memalloc_nofs_save();
lflags = kmem_flags_convert(flags);
ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
- memalloc_noio_restore(noio_flag);
+ if (flags & KM_NOFS)
+ memalloc_nofs_restore(nofs_flag);
return ptr;
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f0fc84fcaac2..d6ea520162b2 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
lflags = GFP_ATOMIC | __GFP_NOWARN;
} else {
lflags = GFP_KERNEL | __GFP_NOWARN;
- if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ if (flags & KM_NOFS)
lflags &= ~__GFP_FS;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 369adcc18c02..7486401ccbd3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2868,3 +2868,60 @@ err:
xfs_trans_brelse(tp, agbp);
return error;
}
+
+struct xfs_alloc_query_range_info {
+ xfs_alloc_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_alloc_query_range_helper(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info *query = priv;
+ struct xfs_alloc_rec_incore irec;
+
+ irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+ irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all free space within a given range of blocks. */
+int
+xfs_alloc_query_range(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *low_rec,
+ struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ low_brec.a = *low_rec;
+ high_brec.a = *high_rec;
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_alloc_query_range_helper, &query);
+}
+
+/* Find all free space records. */
+int
+xfs_alloc_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_alloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_alloc_query_range_info query;
+
+ ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2a8d0fa6fbbe..77d9c27330ab 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -219,4 +219,16 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
+typedef int (*xfs_alloc_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *rec,
+ void *priv);
+
+int xfs_alloc_query_range(struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *low_rec,
+ struct xfs_alloc_rec_incore *high_rec,
+ xfs_alloc_query_range_fn fn, void *priv);
+int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
+ void *priv);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index efb467b10a71..e1fcfe7f0a9a 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -205,19 +205,37 @@ xfs_allocbt_init_key_from_rec(
union xfs_btree_key *key,
union xfs_btree_rec *rec)
{
- ASSERT(rec->alloc.ar_startblock != 0);
-
key->alloc.ar_startblock = rec->alloc.ar_startblock;
key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
}
STATIC void
+xfs_bnobt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ __u32 x;
+
+ x = be32_to_cpu(rec->alloc.ar_startblock);
+ x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
+ key->alloc.ar_startblock = cpu_to_be32(x);
+ key->alloc.ar_blockcount = 0;
+}
+
+STATIC void
+xfs_cntbt_init_high_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+ key->alloc.ar_startblock = 0;
+}
+
+STATIC void
xfs_allocbt_init_rec_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_rec *rec)
{
- ASSERT(cur->bc_rec.a.ar_startblock != 0);
-
rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
}
@@ -236,18 +254,24 @@ xfs_allocbt_init_ptr_from_cur(
}
STATIC __int64_t
-xfs_allocbt_key_diff(
+xfs_bnobt_key_diff(
struct xfs_btree_cur *cur,
union xfs_btree_key *key)
{
xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
xfs_alloc_key_t *kp = &key->alloc;
- __int64_t diff;
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return (__int64_t)be32_to_cpu(kp->ar_startblock) -
- rec->ar_startblock;
- }
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC __int64_t
+xfs_cntbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
+ xfs_alloc_key_t *kp = &key->alloc;
+ __int64_t diff;
diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
if (diff)
@@ -256,6 +280,33 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
+STATIC __int64_t
+xfs_bnobt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+STATIC __int64_t
+xfs_cntbt_diff_two_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ __int64_t diff;
+
+ diff = be32_to_cpu(k1->alloc.ar_blockcount) -
+ be32_to_cpu(k2->alloc.ar_blockcount);
+ if (diff)
+ return diff;
+
+ return be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
+}
+
static bool
xfs_allocbt_verify(
struct xfs_buf *bp)
@@ -346,44 +397,54 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
#if defined(DEBUG) || defined(XFS_WARN)
STATIC int
-xfs_allocbt_keys_inorder(
+xfs_bnobt_keys_inorder(
struct xfs_btree_cur *cur,
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return be32_to_cpu(k1->alloc.ar_startblock) <
- be32_to_cpu(k2->alloc.ar_startblock);
- } else {
- return be32_to_cpu(k1->alloc.ar_blockcount) <
- be32_to_cpu(k2->alloc.ar_blockcount) ||
- (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
- be32_to_cpu(k1->alloc.ar_startblock) <
- be32_to_cpu(k2->alloc.ar_startblock));
- }
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
}
STATIC int
-xfs_allocbt_recs_inorder(
+xfs_bnobt_recs_inorder(
struct xfs_btree_cur *cur,
union xfs_btree_rec *r1,
union xfs_btree_rec *r2)
{
- if (cur->bc_btnum == XFS_BTNUM_BNO) {
- return be32_to_cpu(r1->alloc.ar_startblock) +
- be32_to_cpu(r1->alloc.ar_blockcount) <=
- be32_to_cpu(r2->alloc.ar_startblock);
- } else {
- return be32_to_cpu(r1->alloc.ar_blockcount) <
- be32_to_cpu(r2->alloc.ar_blockcount) ||
- (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
- be32_to_cpu(r1->alloc.ar_startblock) <
- be32_to_cpu(r2->alloc.ar_startblock));
- }
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+}
+
+STATIC int
+xfs_cntbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->alloc.ar_blockcount) <
+ be32_to_cpu(k2->alloc.ar_blockcount) ||
+ (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+ be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock));
}
-#endif /* DEBUG */
-static const struct xfs_btree_ops xfs_allocbt_ops = {
+STATIC int
+xfs_cntbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return be32_to_cpu(r1->alloc.ar_blockcount) <
+ be32_to_cpu(r2->alloc.ar_blockcount) ||
+ (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+ be32_to_cpu(r1->alloc.ar_startblock) <
+ be32_to_cpu(r2->alloc.ar_startblock));
+}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_bnobt_ops = {
.rec_len = sizeof(xfs_alloc_rec_t),
.key_len = sizeof(xfs_alloc_key_t),
@@ -395,13 +456,39 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.get_minrecs = xfs_allocbt_get_minrecs,
.get_maxrecs = xfs_allocbt_get_maxrecs,
.init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_allocbt_key_diff,
+ .key_diff = xfs_bnobt_key_diff,
.buf_ops = &xfs_allocbt_buf_ops,
+ .diff_two_keys = xfs_bnobt_diff_two_keys,
#if defined(DEBUG) || defined(XFS_WARN)
- .keys_inorder = xfs_allocbt_keys_inorder,
- .recs_inorder = xfs_allocbt_recs_inorder,
+ .keys_inorder = xfs_bnobt_keys_inorder,
+ .recs_inorder = xfs_bnobt_recs_inorder,
+#endif
+};
+
+static const struct xfs_btree_ops xfs_cntbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_cntbt_key_diff,
+ .buf_ops = &xfs_allocbt_buf_ops,
+ .diff_two_keys = xfs_cntbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_cntbt_keys_inorder,
+ .recs_inorder = xfs_cntbt_recs_inorder,
#endif
};
@@ -427,16 +514,15 @@ xfs_allocbt_init_cursor(
cur->bc_mp = mp;
cur->bc_btnum = btnum;
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_allocbt_ops;
- if (btnum == XFS_BTNUM_BNO)
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- else
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+ cur->bc_ops = &xfs_cntbt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
} else {
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ cur->bc_ops = &xfs_bnobt_ops;
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9bd104f32908..f02eb7673392 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -764,7 +764,6 @@ xfs_bmap_extents_to_btree(
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
} else if (dfops->dop_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
args.fsbno = *firstblock;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -779,20 +778,6 @@ try_another_ag:
return error;
}
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- goto try_another_ag;
- }
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
xfs_iroot_realloc(ip, -1, whichfork);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -925,7 +910,6 @@ xfs_bmap_local_to_extents(
* file currently fits in an inode.
*/
if (*firstblock == NULLFSBLOCK) {
-try_another_ag:
args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
@@ -938,19 +922,6 @@ try_another_ag:
if (error)
goto done;
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- goto try_another_ag;
- }
/* Can't fail, the space was reserved. */
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
@@ -1260,7 +1231,6 @@ xfs_bmap_read_extents(
xfs_fsblock_t bno; /* block # of "block" */
xfs_buf_t *bp; /* buffer for "block" */
int error; /* error return value */
- xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
xfs_extnum_t i, j; /* index into the extents list */
xfs_ifork_t *ifp; /* fork structure */
int level; /* btree level, for checking */
@@ -1271,8 +1241,6 @@ xfs_bmap_read_extents(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
- XFS_EXTFMT_INODE(ip);
block = ifp->if_broot;
/*
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
@@ -1340,18 +1308,9 @@ xfs_bmap_read_extents(
xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
trp->l0 = be64_to_cpu(frp->l0);
trp->l1 = be64_to_cpu(frp->l1);
- }
- if (exntf == XFS_EXTFMT_NOSTATE) {
- /*
- * Check all attribute bmap btree records and
- * any "older" data bmap btree records for a
- * set bit in the "extent flag" position.
- */
- if (unlikely(xfs_check_nostate_extents(ifp,
- start, num_recs))) {
+ if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) {
XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
+ XFS_ERRLEVEL_LOW, mp);
goto error0;
}
}
@@ -2879,27 +2838,30 @@ xfs_bmap_add_extent_hole_delay(
*/
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
- struct xfs_bmalloca *bma,
- int whichfork)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_extnum_t *idx,
+ struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new,
+ xfs_fsblock_t *first,
+ struct xfs_defer_ops *dfops,
+ int *logflagsp)
{
- struct xfs_bmbt_irec *new = &bma->got;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur = *curp;
int error; /* error return value */
int i; /* temp state */
- xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
int state; /* state bits, accessed thru macros */
- struct xfs_mount *mp;
- mp = bma->ip->i_mount;
- ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-
- ASSERT(bma->idx >= 0);
- ASSERT(bma->idx <= xfs_iext_count(ifp));
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= xfs_iext_count(ifp));
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!bma->cur ||
- !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2912,9 +2874,9 @@ xfs_bmap_add_extent_hole_real(
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (bma->idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -2923,9 +2885,9 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (bma->idx < xfs_iext_count(ifp)) {
+ if (*idx < xfs_iext_count(ifp)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -2962,36 +2924,36 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- --bma->idx;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount +
right.br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
- XFS_IFORK_NEXT_SET(bma->ip, whichfork,
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
- if (bma->cur == NULL) {
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+ error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
right.br_startblock, right.br_blockcount,
&i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_btree_delete(bma->cur, &i);
+ error = xfs_btree_delete(cur, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_btree_decrement(bma->cur, 0, &i);
+ error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount +
@@ -3008,23 +2970,23 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- --bma->idx;
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (bma->cur == NULL) {
+ if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+ error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
left.br_startblock, left.br_blockcount,
&i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, left.br_startoff,
+ error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
new->br_blockcount,
@@ -3040,25 +3002,25 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
new->br_startoff, new->br_startblock,
new->br_blockcount + right.br_blockcount,
right.br_state);
- trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- if (bma->cur == NULL) {
+ if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
rval = 0;
- error = xfs_bmbt_lookup_eq(bma->cur,
+ error = xfs_bmbt_lookup_eq(cur,
right.br_startoff,
right.br_startblock,
right.br_blockcount, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- error = xfs_bmbt_update(bma->cur, new->br_startoff,
+ error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
right.br_blockcount,
@@ -3074,22 +3036,22 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
- XFS_IFORK_NEXT_SET(bma->ip, whichfork,
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
- if (bma->cur == NULL) {
+ xfs_iext_insert(ip, *idx, 1, new, state);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
rval = XFS_ILOG_CORE;
- error = xfs_bmbt_lookup_eq(bma->cur,
+ error = xfs_bmbt_lookup_eq(cur,
new->br_startoff,
new->br_startblock,
new->br_blockcount, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
- bma->cur->bc_rec.b.br_state = new->br_state;
- error = xfs_btree_insert(bma->cur, &i);
+ cur->bc_rec.b.br_state = new->br_state;
+ error = xfs_btree_insert(cur, &i);
if (error)
goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -3098,30 +3060,30 @@ xfs_bmap_add_extent_hole_real(
}
/* add reverse mapping */
- error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+ error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
if (error)
goto done;
/* convert to a btree if necessary */
- if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
- ASSERT(bma->cur == NULL);
- error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
- bma->firstblock, bma->dfops, &bma->cur,
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, curp,
0, &tmp_logflags, whichfork);
- bma->logflags |= tmp_logflags;
+ *logflagsp |= tmp_logflags;
+ cur = *curp;
if (error)
goto done;
}
/* clear out the allocated field, done with it now in any case. */
- if (bma->cur)
- bma->cur->bc_private.b.allocated = 0;
+ if (cur)
+ cur->bc_private.b.allocated = 0;
- xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+ xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
- bma->logflags |= rval;
+ *logflagsp |= rval;
return error;
}
@@ -3853,60 +3815,6 @@ xfs_bmap_btalloc(
}
/*
- * For a remap operation, just "allocate" an extent at the address that the
- * caller passed in, and ensure that the AGFL is the right size. The caller
- * will then map the "allocated" extent into the file somewhere.
- */
-STATIC int
-xfs_bmap_remap_alloc(
- struct xfs_bmalloca *ap)
-{
- struct xfs_trans *tp = ap->tp;
- struct xfs_mount *mp = tp->t_mountp;
- xfs_agblock_t bno;
- struct xfs_alloc_arg args;
- int error;
-
- /*
- * validate that the block number is legal - the enables us to detect
- * and handle a silent filesystem corruption rather than crashing.
- */
- memset(&args, 0, sizeof(struct xfs_alloc_arg));
- args.tp = ap->tp;
- args.mp = ap->tp->t_mountp;
- bno = *ap->firstblock;
- args.agno = XFS_FSB_TO_AGNO(mp, bno);
- args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
- if (args.agno >= mp->m_sb.sb_agcount ||
- args.agbno >= mp->m_sb.sb_agblocks)
- return -EFSCORRUPTED;
-
- /* "Allocate" the extent from the range we passed in. */
- trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
- ap->blkno = bno;
- ap->ip->i_d.di_nblocks += ap->length;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
-
- /* Fix the freelist, like a real allocator does. */
- args.datatype = ap->datatype;
- args.pag = xfs_perag_get(args.mp, args.agno);
- ASSERT(args.pag);
-
- /*
- * The freelist fixing code will decline the allocation if
- * the size and shape of the free space doesn't allow for
- * allocating the extent and updating all the metadata that
- * happens during an allocation. We're remapping, not
- * allocating, so skip that check by pretending to be freeing.
- */
- error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
- xfs_perag_put(args.pag);
- if (error)
- trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
- return error;
-}
-
-/*
* xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
* It figures out where to ask the underlying allocator to put the new extent.
*/
@@ -3914,8 +3822,6 @@ STATIC int
xfs_bmap_alloc(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
- if (ap->flags & XFS_BMAPI_REMAP)
- return xfs_bmap_remap_alloc(ap);
if (XFS_IS_REALTIME_INODE(ap->ip) &&
xfs_alloc_is_userdata(ap->datatype))
return xfs_bmap_rtalloc(ap);
@@ -4386,7 +4292,9 @@ xfs_bmapi_allocate(
if (bma->wasdel)
error = xfs_bmap_add_extent_delay_real(bma, whichfork);
else
- error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+ error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
+ whichfork, &bma->idx, &bma->cur, &bma->got,
+ bma->firstblock, bma->dfops, &bma->logflags);
bma->logflags |= tmp_logflags;
if (error)
@@ -4549,9 +4457,7 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
- ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
- ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_REMAP));
/* zeroing is for currently only for data extents, not metadata */
ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4635,13 +4541,8 @@ xfs_bmapi_write(
} else {
need_alloc = true;
}
- } else {
- /*
- * Make sure we only reflink into a hole.
- */
- ASSERT(!(flags & XFS_BMAPI_REMAP));
- if (isnullstartblock(bma.got.br_startblock))
- wasdelay = true;
+ } else if (isnullstartblock(bma.got.br_startblock)) {
+ wasdelay = true;
}
/*
@@ -4770,6 +4671,93 @@ error0:
return error;
}
+static int
+xfs_bmapi_remap(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t bno,
+ xfs_filblks_t len,
+ xfs_fsblock_t startblock,
+ struct xfs_defer_ops *dfops)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_btree_cur *cur = NULL;
+ xfs_fsblock_t firstblock = NULLFSBLOCK;
+ struct xfs_bmbt_irec got;
+ xfs_extnum_t idx;
+ int logflags = 0, error;
+
+ ASSERT(len > 0);
+ ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+
+ if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) {
+ /* make sure we only reflink into a hole. */
+ ASSERT(got.br_startoff > bno);
+ ASSERT(got.br_startoff - bno >= len);
+ }
+
+ ip->i_d.di_nblocks += len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ cur->bc_private.b.firstblock = firstblock;
+ cur->bc_private.b.dfops = dfops;
+ cur->bc_private.b.flags = 0;
+ }
+
+ got.br_startoff = bno;
+ got.br_startblock = startblock;
+ got.br_blockcount = len;
+ got.br_state = XFS_EXT_NORM;
+
+ error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur,
+ &got, &firstblock, dfops, &logflags);
+ if (error)
+ goto error0;
+
+ if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) {
+ int tmp_logflags = 0;
+
+ error = xfs_bmap_btree_to_extents(tp, ip, cur,
+ &tmp_logflags, XFS_DATA_FORK);
+ logflags |= tmp_logflags;
+ }
+
+error0:
+ if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
+ logflags &= ~XFS_ILOG_DEXT;
+ else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ logflags &= ~XFS_ILOG_DBROOT;
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ if (cur) {
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+ return error;
+}
+
/*
* When a delalloc extent is split (e.g., due to a hole punch), the original
* indlen reservation must be shared across the two new extents that are left
@@ -4887,7 +4875,7 @@ xfs_bmap_del_extent_delay(
ASSERT(got_endoff >= del_endoff);
if (isrt) {
- int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+ uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
do_div(rtexts, mp->m_sb.sb_rextsize);
xfs_mod_frextents(mp, rtexts);
@@ -6488,27 +6476,15 @@ xfs_bmap_finish_one(
xfs_filblks_t blockcount,
xfs_exntst_t state)
{
- struct xfs_bmbt_irec bmap;
- int nimaps = 1;
- xfs_fsblock_t firstfsb;
- int flags = XFS_BMAPI_REMAP;
- int done;
- int error = 0;
-
- bmap.br_startblock = startblock;
- bmap.br_startoff = startoff;
- bmap.br_blockcount = blockcount;
- bmap.br_state = state;
+ int error = 0, done;
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
ip->i_ino, whichfork, startoff, blockcount, state);
- if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+ if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
return -EFSCORRUPTED;
- if (whichfork == XFS_ATTR_FORK)
- flags |= XFS_BMAPI_ATTRFORK;
if (XFS_TEST_ERROR(false, tp->t_mountp,
XFS_ERRTAG_BMAP_FINISH_ONE,
@@ -6517,16 +6493,12 @@ xfs_bmap_finish_one(
switch (type) {
case XFS_BMAP_MAP:
- firstfsb = bmap.br_startblock;
- error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, &firstfsb,
- bmap.br_blockcount, &bmap, &nimaps,
- dfops);
+ error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
+ startblock, dfops);
break;
case XFS_BMAP_UNMAP:
- error = xfs_bunmapi(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, 1, &firstfsb,
- dfops, &done);
+ error = xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
ASSERT(done);
break;
default:
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cdef87db5262..c35a14fa1527 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -172,6 +172,18 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
+ * Return true if the extent is a real, allocated extent, or false if it is a
+ * delayed allocation, and unwritten extent or a hole.
+ */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_state != XFS_EXT_UNWRITTEN &&
+ irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
+
+/*
* This macro is used to determine how many extents will be shifted
* in one write transaction. We could require two splits,
* an extent move on the first and an extent merge on the second,
@@ -232,8 +244,6 @@ int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *del);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
-int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
- xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fd55db479385..6cba69aff077 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -366,32 +366,6 @@ xfs_bmbt_to_bmdr(
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
-/*
- * Check extent records, which have just been read, for
- * any bit in the extent flag field. ASSERT on debug
- * kernels, as this condition should not occur.
- * Return an error condition (1) if any flags found,
- * otherwise return 0.
- */
-
-int
-xfs_check_nostate_extents(
- xfs_ifork_t *ifp,
- xfs_extnum_t idx,
- xfs_extnum_t num)
-{
- for (; num > 0; num--, idx++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
- if ((ep->l0 >>
- (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
- ASSERT(0);
- return 1;
- }
- }
- return 0;
-}
-
-
STATIC struct xfs_btree_cur *
xfs_bmbt_dup_cursor(
struct xfs_btree_cur *cur)
@@ -448,7 +422,6 @@ xfs_bmbt_alloc_block(
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
/*
* Make sure there is sufficient room left in the AG to
* complete a full tree split for an extent insert. If
@@ -477,22 +450,6 @@ try_another_ag:
if (error)
goto error0;
- /*
- * During a CoW operation, the allocation and bmbt updates occur in
- * different transactions. The mapping code tries to put new bmbt
- * blocks near extents being mapped, but the only way to guarantee this
- * is if the alloc and the mapping happen in a single transaction that
- * has a block reservation. That isn't the case here, so if we run out
- * of space we'll try again with another AG.
- */
- if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
- args.fsbno == NULLFSBLOCK &&
- args.type == XFS_ALLOCTYPE_NEAR_BNO) {
- args.fsbno = cur->bc_private.b.firstblock;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- goto try_another_ag;
- }
-
if (args.fsbno == NULLFSBLOCK && args.minleft) {
/*
* Could not find an AG with enough free space to satisfy
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..9da5a8d4f184 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -25,14 +25,6 @@ struct xfs_inode;
struct xfs_trans;
/*
- * Extent state and extent format macros.
- */
-#define XFS_EXTFMT_INODE(x) \
- (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
- XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
-#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
-
-/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_BMBT_BLOCK_LEN(mp) \
@@ -140,4 +132,18 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+/*
+ * Check that the extent does not contain an invalid unwritten extent flag.
+ */
+static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
+ struct xfs_bmbt_rec_host *ep)
+{
+ if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0)
+ return true;
+ if (whichfork == XFS_DATA_FORK &&
+ xfs_sb_version_hasextflgbit(&mp->m_sb))
+ return true;
+ return false;
+}
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c3decedc9455..5392674bf893 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_FSTRANS;
+ unsigned long new_pflags = PF_MEMALLOC_NOFS;
/*
* we are in a transaction context here, but may also be doing work
@@ -4842,6 +4842,21 @@ xfs_btree_query_range(
fn, priv);
}
+/* Query a btree for all records. */
+int
+xfs_btree_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_btree_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_rec;
+ union xfs_btree_irec high_rec;
+
+ memset(&low_rec, 0, sizeof(low_rec));
+ memset(&high_rec, 0xFF, sizeof(high_rec));
+ return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv);
+}
+
/*
* Calculate the number of blocks needed to store a given number of records
* in a short-format (per-AG metadata) btree.
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4bb62580a7fd..27bed08261c5 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -496,6 +496,8 @@ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
int xfs_btree_query_range(struct xfs_btree_cur *cur,
union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
xfs_btree_query_range_fn fn, void *priv);
+int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
+ void *priv);
typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
void *data);
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index ac9a003dd29a..747085b4ef44 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -35,13 +35,8 @@ int
xfs_calc_dquots_per_chunk(
unsigned int nbblks) /* basic block units */
{
- unsigned int ndquots;
-
ASSERT(nbblks > 0);
- ndquots = BBTOB(nbblks);
- do_div(ndquots, sizeof(xfs_dqblk_t));
-
- return ndquots;
+ return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
}
/*
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 6b7579e7b60a..a1dccd8d96bc 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -930,10 +930,8 @@ static inline uint xfs_dinode_size(int version)
/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
* Since the pathconf interface is signed, we use 2^31 - 1 instead.
- * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
*/
#define XFS_MAXLINK ((1U << 31) - 1U)
-#define XFS_MAXLINK_1 65535U
/*
* Values for di_format
@@ -1578,19 +1576,10 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
}
/*
- * Possible extent formats.
- */
-typedef enum {
- XFS_EXTFMT_NOSTATE = 0,
- XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
* Possible extent states.
*/
typedef enum {
XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
- XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
} xfs_exntst_t;
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index b72dc821d78b..095bdf049a3f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -92,6 +92,18 @@ struct getbmapx {
#define BMV_OF_LAST 0x4 /* segment is the last in the file */
#define BMV_OF_SHARED 0x8 /* segment shared with another file */
+/* fmr_owner special values for FS_IOC_GETFSMAP */
+#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */
+#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */
+#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */
+#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */
+#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */
+#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */
+#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */
+#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */
+#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */
+#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */
+
/*
* Structure for XFS_IOC_FSSETDM.
* For use by backup and restore programs to set the XFS on-disk inode
@@ -502,6 +514,7 @@ typedef struct xfs_swapext
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
+/* XFS_IOC_GETFSMAP ------ hoisted 59 */
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d93f9d918cfc..09c3d1aecef2 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -508,7 +508,7 @@ xfs_iread(
/* even unallocated inodes are verified */
if (!xfs_dinode_verify(mp, ip->i_ino, dip)) {
- xfs_alert(mp, "%s: validation failed for inode %lld failed",
+ xfs_alert(mp, "%s: validation failed for inode %lld",
__func__, ip->i_ino);
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8a37efe04de3..0e80f34fe97c 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -42,35 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-#ifdef DEBUG
-/*
- * Make sure that the extents in the given memory buffer
- * are valid.
- */
-void
-xfs_validate_extents(
- xfs_ifork_t *ifp,
- int nrecs,
- xfs_exntfmt_t fmt)
-{
- xfs_bmbt_irec_t irec;
- xfs_bmbt_rec_host_t rec;
- int i;
-
- for (i = 0; i < nrecs; i++) {
- xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- rec.l0 = get_unaligned(&ep->l0);
- rec.l1 = get_unaligned(&ep->l1);
- xfs_bmbt_get_all(&rec, &irec);
- if (fmt == XFS_EXTFMT_NOSTATE)
- ASSERT(irec.br_state == XFS_EXT_NORM);
- }
-}
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
-
-
/*
* Move inode type and inode format specific information from the
* on-disk inode to the in-core inode. For fifos, devs, and sockets
@@ -352,40 +323,33 @@ xfs_iformat_local(
}
/*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it. Either way, set if_extents
- * to point at the extents.
+ * The file consists of a set of extents all of which fit into the on-disk
+ * inode. If there are few enough extents to fit into the if_inline_ext, then
+ * copy them there. Otherwise allocate a buffer for them and copy them into it.
+ * Either way, set if_extents to point at the extents.
*/
STATIC int
xfs_iformat_extents(
- xfs_inode_t *ip,
- xfs_dinode_t *dip,
- int whichfork)
+ struct xfs_inode *ip,
+ struct xfs_dinode *dip,
+ int whichfork)
{
- xfs_bmbt_rec_t *dp;
- xfs_ifork_t *ifp;
- int nex;
- int size;
- int i;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nex = XFS_DFORK_NEXTENTS(dip, whichfork);
- size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ int size = nex * sizeof(xfs_bmbt_rec_t);
+ struct xfs_bmbt_rec *dp;
+ int i;
/*
- * If the number of extents is unreasonable, then something
- * is wrong and we just bail out rather than crash in
- * kmem_alloc() or memcpy() below.
+ * If the number of extents is unreasonable, then something is wrong and
+ * we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
- if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
(unsigned long long) ip->i_ino, nex);
XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
- ip->i_mount, dip);
+ mp, dip);
return -EFSCORRUPTED;
}
@@ -400,22 +364,17 @@ xfs_iformat_extents(
ifp->if_bytes = size;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
- xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
for (i = 0; i < nex; i++, dp++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
ep->l0 = get_unaligned_be64(&dp->l0);
ep->l1 = get_unaligned_be64(&dp->l1);
+ if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) {
+ XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+ XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
}
XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
- if (whichfork != XFS_DATA_FORK ||
- XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
- if (unlikely(xfs_check_nostate_extents(
- ifp, 0, nex))) {
- XFS_ERROR_REPORT("xfs_iformat_extents(2)",
- XFS_ERRLEVEL_LOW,
- ip->i_mount);
- return -EFSCORRUPTED;
- }
}
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
@@ -518,7 +477,6 @@ xfs_iread_extents(
xfs_iext_destroy(ifp);
return error;
}
- xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
}
@@ -837,6 +795,9 @@ xfs_iextents_copy(
copied = 0;
for (i = 0; i < nrecs; i++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+
+ ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep));
+
start_block = xfs_bmbt_get_startblock(ep);
if (isnullstartblock(start_block)) {
/*
@@ -852,7 +813,6 @@ xfs_iextents_copy(
copied++;
}
ASSERT(copied != 0);
- xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
return (copied * (uint)sizeof(xfs_bmbt_rec_t));
}
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3a8cc7139912..06cfb93c2ef9 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2001,14 +2001,14 @@ xfs_rmap_query_range_helper(
/* Find all rmaps between two keys. */
int
xfs_rmap_query_range(
- struct xfs_btree_cur *cur,
- struct xfs_rmap_irec *low_rec,
- struct xfs_rmap_irec *high_rec,
- xfs_rmap_query_range_fn fn,
- void *priv)
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *low_rec,
+ struct xfs_rmap_irec *high_rec,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
{
- union xfs_btree_irec low_brec;
- union xfs_btree_irec high_brec;
+ union xfs_btree_irec low_brec;
+ union xfs_btree_irec high_brec;
struct xfs_rmap_query_range_info query;
low_brec.r = *low_rec;
@@ -2019,6 +2019,20 @@ xfs_rmap_query_range(
xfs_rmap_query_range_helper, &query);
}
+/* Find all rmaps. */
+int
+xfs_rmap_query_all(
+ struct xfs_btree_cur *cur,
+ xfs_rmap_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rmap_query_range_info query;
+
+ query.priv = priv;
+ query.fn = fn;
+ return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
+}
+
/* Clean up after calling xfs_rmap_finish_one. */
void
xfs_rmap_finish_one_cleanup(
@@ -2291,3 +2305,31 @@ xfs_rmap_free_extent(
return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner,
XFS_DATA_FORK, &bmap);
}
+
+/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
+int
+xfs_rmap_compare(
+ const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b)
+{
+ __u64 oa;
+ __u64 ob;
+
+ oa = xfs_rmap_irec_offset_pack(a);
+ ob = xfs_rmap_irec_offset_pack(b);
+
+ if (a->rm_startblock < b->rm_startblock)
+ return -1;
+ else if (a->rm_startblock > b->rm_startblock)
+ return 1;
+ else if (a->rm_owner < b->rm_owner)
+ return -1;
+ else if (a->rm_owner > b->rm_owner)
+ return 1;
+ else if (oa < ob)
+ return -1;
+ else if (oa > ob)
+ return 1;
+ else
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 789930599339..98f908fea103 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -162,6 +162,8 @@ typedef int (*xfs_rmap_query_range_fn)(
int xfs_rmap_query_range(struct xfs_btree_cur *cur,
struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
xfs_rmap_query_range_fn fn, void *priv);
+int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
+ void *priv);
enum xfs_rmap_intent_type {
XFS_RMAP_MAP,
@@ -212,5 +214,7 @@ int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
+int xfs_rmap_compare(const struct xfs_rmap_irec *a,
+ const struct xfs_rmap_irec *b);
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index ea45584a9913..e47b99e59f60 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1016,3 +1016,73 @@ xfs_rtfree_extent(
}
return 0;
}
+
+/* Find all the free records within a given range. */
+int
+xfs_rtalloc_query_range(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *low_rec,
+ struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec rec;
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_rtblock_t rtstart;
+ xfs_rtblock_t rtend;
+ xfs_rtblock_t rem;
+ int is_free;
+ int error = 0;
+
+ if (low_rec->ar_startblock > high_rec->ar_startblock)
+ return -EINVAL;
+ else if (low_rec->ar_startblock == high_rec->ar_startblock)
+ return 0;
+
+ /* Iterate the bitmap, looking for discrepancies. */
+ rtstart = low_rec->ar_startblock;
+ rem = high_rec->ar_startblock - rtstart;
+ while (rem) {
+ /* Is the first block free? */
+ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+ &is_free);
+ if (error)
+ break;
+
+ /* How long does the extent go for? */
+ error = xfs_rtfind_forw(mp, tp, rtstart,
+ high_rec->ar_startblock - 1, &rtend);
+ if (error)
+ break;
+
+ if (is_free) {
+ rec.ar_startblock = rtstart;
+ rec.ar_blockcount = rtend - rtstart + 1;
+
+ error = fn(tp, &rec, priv);
+ if (error)
+ break;
+ }
+
+ rem -= rtend - rtstart + 1;
+ rtstart = rtend + 1;
+ }
+
+ return error;
+}
+
+/* Find all the free records. */
+int
+xfs_rtalloc_query_all(
+ struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv)
+{
+ struct xfs_rtalloc_rec keys[2];
+
+ keys[0].ar_startblock = 0;
+ keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks;
+ keys[0].ar_blockcount = keys[1].ar_blockcount = 0;
+
+ return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 7917f6e44286..d787c677d2a3 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,8 +21,20 @@
/*
* Components of space reservations.
*/
+
+/* Worst case number of rmaps that can be held in a block. */
#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \
(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
+
+/* Adding one rmap could split every level up to the top of the tree. */
+#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+
+/* Blocks we might need to add "b" rmaps to a tree. */
+#define XFS_NRMAPADD_SPACE_RES(mp, b)\
+ (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+ XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+ XFS_RMAPADD_SPACE_RES(mp))
+
#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -30,13 +42,12 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
+
+/* Blocks we might need to add "b" mappings & rmappings to a file. */
#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
- (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
- XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
- XFS_EXTENTADD_SPACE_RES(mp,w) + \
- ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
- XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
- (mp)->m_rmap_maxlevels)
+ (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \
+ XFS_NRMAPADD_SPACE_RES((mp), (b)))
+
#define XFS_DAENTER_1B(mp,w) \
((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 61494295d92f..09af0f7cd55e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,11 +111,11 @@ xfs_finish_page_writeback(
bsize = bh->b_size;
do {
+ if (off > end)
+ break;
next = bh->b_this_page;
if (off < bvec->bv_offset)
goto next_bh;
- if (off > end)
- break;
bh->b_end_io(bh, !error);
next_bh:
off += bsize;
@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return 0;
}
@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -1016,7 +1016,7 @@ xfs_do_writepage(
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
- if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
goto redirty;
/*
@@ -1261,8 +1261,8 @@ xfs_get_blocks(
if (nimaps) {
trace_xfs_get_blocks_found(ip, offset, size,
- ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
- : XFS_IO_OVERWRITE, &imap);
+ imap.br_state == XFS_EXT_UNWRITTEN ?
+ XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
xfs_iunlock(ip, lockmode);
} else {
trace_xfs_get_blocks_notfound(ip, offset, size);
@@ -1276,9 +1276,7 @@ xfs_get_blocks(
* For unwritten extents do not report a disk address in the buffered
* read case (treat as if we're reading into a hole).
*/
- if (imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(&imap))
+ if (xfs_bmap_is_real_extent(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
/*
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9bf57c76623b..d419d23fa214 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -34,6 +34,8 @@
#include "xfs_bmap.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
kmem_zone_t *xfs_bui_zone;
@@ -215,6 +217,7 @@ void
xfs_bui_release(
struct xfs_bui_log_item *buip)
{
+ ASSERT(atomic_read(&buip->bui_refcount) > 0);
if (atomic_dec_and_test(&buip->bui_refcount)) {
xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_bui_item_free(buip);
@@ -446,7 +449,8 @@ xfs_bui_recover(
return -EIO;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
return error;
budp = xfs_trans_get_bud(tp, buip);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 828532ce0adc..2b954308a1d6 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -81,7 +81,7 @@ xfs_zero_extent(
return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_NOFS, true);
+ GFP_NOFS, 0);
}
int
@@ -448,10 +448,9 @@ xfs_getbmap_adjust_shared(
next_map->br_blockcount = 0;
/* Only written data blocks can be shared. */
- if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
- map->br_startblock == DELAYSTARTBLOCK ||
- map->br_startblock == HOLESTARTBLOCK ||
- ISUNWRITTEN(map))
+ if (!xfs_is_reflink_inode(ip) ||
+ whichfork != XFS_DATA_FORK ||
+ !xfs_bmap_is_real_extent(map))
return 0;
agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
@@ -904,9 +903,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
}
/*
- * This is called by xfs_inactive to free any blocks beyond eof
- * when the link count isn't zero and by xfs_dm_punch_hole() when
- * punching a hole to EOF.
+ * This is called to free any blocks beyond eof. The caller must hold
+ * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
+ * reference to the inode.
*/
int
xfs_free_eofblocks(
@@ -921,8 +920,6 @@ xfs_free_eofblocks(
struct xfs_bmbt_irec imap;
struct xfs_mount *mp = ip->i_mount;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
/*
* Figure out if there are any blocks beyond the end
* of the file. If not, then there is nothing to do.
@@ -1209,11 +1206,8 @@ xfs_adjust_extent_unmap_boundaries(
return error;
if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- xfs_daddr_t block;
-
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- block = imap.br_startblock;
- mod = do_div(block, mp->m_sb.sb_rextsize);
+ mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize);
if (mod)
*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b6208728ba39..62fa39276a24 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -443,17 +443,17 @@ _xfs_buf_map_pages(
bp->b_addr = NULL;
} else {
int retried = 0;
- unsigned noio_flag;
+ unsigned nofs_flag;
/*
* vm_map_ram() will allocate auxillary structures (e.g.
* pagetables) with GFP_KERNEL, yet we are likely to be under
* GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+ * that we are in such a context via PF_MEMALLOC_NOFS to prevent
* memory reclaim re-entering the filesystem here and
* potentially deadlocking.
*/
- noio_flag = memalloc_noio_save();
+ nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1, PAGE_KERNEL);
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
break;
vm_unmap_aliases();
} while (retried++ <= 1);
- memalloc_noio_restore(noio_flag);
+ memalloc_nofs_restore(nofs_flag);
if (!bp->b_addr)
return -ENOMEM;
@@ -1079,6 +1079,8 @@ void
xfs_buf_unlock(
struct xfs_buf *bp)
{
+ ASSERT(xfs_buf_islocked(bp));
+
XB_CLEAR_OWNER(bp);
up(&bp->b_sema);
@@ -1815,6 +1817,28 @@ error:
}
/*
+ * Cancel a delayed write list.
+ *
+ * Remove each buffer from the list, clear the delwri queue flag and drop the
+ * associated buffer reference.
+ */
+void
+xfs_buf_delwri_cancel(
+ struct list_head *list)
+{
+ struct xfs_buf *bp;
+
+ while (!list_empty(list)) {
+ bp = list_first_entry(list, struct xfs_buf, b_list);
+
+ xfs_buf_lock(bp);
+ bp->b_flags &= ~_XBF_DELWRI_Q;
+ list_del_init(&bp->b_list);
+ xfs_buf_relse(bp);
+ }
+}
+
+/*
* Add a buffer to the delayed write list.
*
* This queues a buffer for writeout if it hasn't already been. Note that
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3c867e5a63e1..8d1d44f87ce9 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -291,7 +291,6 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
@@ -330,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t);
extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
+extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index ad9396e516f6..20b7a5c6eb2f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf(
/*
* Do we need more readahead?
+ * Each loop tries to process 1 full dir blk; last may be partial.
*/
blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
@@ -404,7 +405,8 @@ xfs_dir2_leaf_readbuf(
* Read-ahead a contiguous directory block.
*/
if (i > mip->ra_current &&
- map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+ (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
+ geo->fsbcount) {
xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff + mip->ra_offset,
XFS_FSB_TO_DADDR(dp->i_mount,
@@ -425,14 +427,19 @@ xfs_dir2_leaf_readbuf(
}
/*
- * Advance offset through the mapping table.
+ * Advance offset through the mapping table, processing a full
+ * dir block even if it is fragmented into several extents.
+ * But stop if we have consumed all valid mappings, even if
+ * it's not yet a full directory block.
*/
- for (j = 0; j < geo->fsbcount; j += length ) {
+ for (j = 0;
+ j < geo->fsbcount && mip->ra_index < mip->map_valid;
+ j += length ) {
/*
* The rest of this extent but not more than a dir
* block.
*/
- length = min_t(int, geo->fsbcount,
+ length = min_t(int, geo->fsbcount - j,
map[mip->ra_index].br_blockcount -
mip->ra_offset);
mip->ra_offset += length;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d796ffac7296..6a05d278da64 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -132,6 +132,11 @@ next_extent:
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto out_del_cursor;
+
+ if (fatal_signal_pending(current)) {
+ error = -ERESTARTSYS;
+ goto out_del_cursor;
+ }
}
out_del_cursor:
@@ -196,8 +201,11 @@ xfs_ioc_trim(
for (agno = start_agno; agno <= end_agno; agno++) {
error = xfs_trim_extents(mp, agno, start, end, minlen,
&blocks_trimmed);
- if (error)
+ if (error) {
last_error = error;
+ if (error == -ERESTARTSYS)
+ break;
+ }
}
if (last_error)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d7bc14906af8..44f8c5451210 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -290,6 +290,7 @@ void
xfs_efi_release(
struct xfs_efi_log_item *efip)
{
+ ASSERT(atomic_read(&efip->efi_refcount) > 0);
if (atomic_dec_and_test(&efip->efi_refcount)) {
xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
xfs_efi_item_free(efip);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
new file mode 100644
index 000000000000..3683819887a5
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.c
@@ -0,0 +1,940 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rtalloc.h"
+
+/* Convert an xfs_fsmap to an fsmap. */
+void
+xfs_fsmap_from_internal(
+ struct fsmap *dest,
+ struct xfs_fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = BBTOB(src->fmr_physical);
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_offset = BBTOB(src->fmr_offset);
+ dest->fmr_length = BBTOB(src->fmr_length);
+ dest->fmr_reserved[0] = 0;
+ dest->fmr_reserved[1] = 0;
+ dest->fmr_reserved[2] = 0;
+}
+
+/* Convert an fsmap to an xfs_fsmap. */
+void
+xfs_fsmap_to_internal(
+ struct xfs_fsmap *dest,
+ struct fsmap *src)
+{
+ dest->fmr_device = src->fmr_device;
+ dest->fmr_flags = src->fmr_flags;
+ dest->fmr_physical = BTOBBT(src->fmr_physical);
+ dest->fmr_owner = src->fmr_owner;
+ dest->fmr_offset = BTOBBT(src->fmr_offset);
+ dest->fmr_length = BTOBBT(src->fmr_length);
+}
+
+/* Convert an fsmap owner into an rmapbt owner. */
+static int
+xfs_fsmap_owner_to_rmap(
+ struct xfs_rmap_irec *dest,
+ struct xfs_fsmap *src)
+{
+ if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) {
+ dest->rm_owner = src->fmr_owner;
+ return 0;
+ }
+
+ switch (src->fmr_owner) {
+ case 0: /* "lowest owner id possible" */
+ case -1ULL: /* "highest owner id possible" */
+ dest->rm_owner = 0;
+ break;
+ case XFS_FMR_OWN_FREE:
+ dest->rm_owner = XFS_RMAP_OWN_NULL;
+ break;
+ case XFS_FMR_OWN_UNKNOWN:
+ dest->rm_owner = XFS_RMAP_OWN_UNKNOWN;
+ break;
+ case XFS_FMR_OWN_FS:
+ dest->rm_owner = XFS_RMAP_OWN_FS;
+ break;
+ case XFS_FMR_OWN_LOG:
+ dest->rm_owner = XFS_RMAP_OWN_LOG;
+ break;
+ case XFS_FMR_OWN_AG:
+ dest->rm_owner = XFS_RMAP_OWN_AG;
+ break;
+ case XFS_FMR_OWN_INOBT:
+ dest->rm_owner = XFS_RMAP_OWN_INOBT;
+ break;
+ case XFS_FMR_OWN_INODES:
+ dest->rm_owner = XFS_RMAP_OWN_INODES;
+ break;
+ case XFS_FMR_OWN_REFC:
+ dest->rm_owner = XFS_RMAP_OWN_REFC;
+ break;
+ case XFS_FMR_OWN_COW:
+ dest->rm_owner = XFS_RMAP_OWN_COW;
+ break;
+ case XFS_FMR_OWN_DEFECTIVE: /* not implemented */
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Convert an rmapbt owner into an fsmap owner. */
+static int
+xfs_fsmap_owner_from_rmap(
+ struct xfs_fsmap *dest,
+ struct xfs_rmap_irec *src)
+{
+ dest->fmr_flags = 0;
+ if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
+ dest->fmr_owner = src->rm_owner;
+ return 0;
+ }
+ dest->fmr_flags |= FMR_OF_SPECIAL_OWNER;
+
+ switch (src->rm_owner) {
+ case XFS_RMAP_OWN_FS:
+ dest->fmr_owner = XFS_FMR_OWN_FS;
+ break;
+ case XFS_RMAP_OWN_LOG:
+ dest->fmr_owner = XFS_FMR_OWN_LOG;
+ break;
+ case XFS_RMAP_OWN_AG:
+ dest->fmr_owner = XFS_FMR_OWN_AG;
+ break;
+ case XFS_RMAP_OWN_INOBT:
+ dest->fmr_owner = XFS_FMR_OWN_INOBT;
+ break;
+ case XFS_RMAP_OWN_INODES:
+ dest->fmr_owner = XFS_FMR_OWN_INODES;
+ break;
+ case XFS_RMAP_OWN_REFC:
+ dest->fmr_owner = XFS_FMR_OWN_REFC;
+ break;
+ case XFS_RMAP_OWN_COW:
+ dest->fmr_owner = XFS_FMR_OWN_COW;
+ break;
+ case XFS_RMAP_OWN_NULL: /* "free" */
+ dest->fmr_owner = XFS_FMR_OWN_FREE;
+ break;
+ default:
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+/* getfsmap query state */
+struct xfs_getfsmap_info {
+ struct xfs_fsmap_head *head;
+ xfs_fsmap_format_t formatter; /* formatting fn */
+ void *format_arg; /* format buffer */
+ struct xfs_buf *agf_bp; /* AGF, for refcount queries */
+ xfs_daddr_t next_daddr; /* next daddr we expect */
+ u64 missing_owner; /* owner of holes */
+ u32 dev; /* device id */
+ xfs_agnumber_t agno; /* AG number, if applicable */
+ struct xfs_rmap_irec low; /* low rmap key */
+ struct xfs_rmap_irec high; /* high rmap key */
+ bool last; /* last extent? */
+};
+
+/* Associate a device with a getfsmap handler. */
+struct xfs_getfsmap_dev {
+ u32 dev;
+ int (*fn)(struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info);
+};
+
+/* Compare two getfsmap device handlers. */
+static int
+xfs_getfsmap_dev_compare(
+ const void *p1,
+ const void *p2)
+{
+ const struct xfs_getfsmap_dev *d1 = p1;
+ const struct xfs_getfsmap_dev *d2 = p2;
+
+ return d1->dev - d2->dev;
+}
+
+/* Decide if this mapping is shared. */
+STATIC int
+xfs_getfsmap_is_shared(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_rmap_irec *rec,
+ bool *stat)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error;
+
+ *stat = false;
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return 0;
+ /* rt files will have agno set to NULLAGNUMBER */
+ if (info->agno == NULLAGNUMBER)
+ return 0;
+
+ /* Are there any shared blocks here? */
+ flen = 0;
+ cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
+ info->agno, NULL);
+
+ error = xfs_refcount_find_shared(cur, rec->rm_startblock,
+ rec->rm_blockcount, &fbno, &flen, false);
+
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ if (error)
+ return error;
+
+ *stat = flen > 0;
+ return 0;
+}
+
+/*
+ * Format a reverse mapping for getfsmap, having translated rm_startblock
+ * into the appropriate daddr units.
+ */
+STATIC int
+xfs_getfsmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_rmap_irec *rec,
+ xfs_daddr_t rec_daddr)
+{
+ struct xfs_fsmap fmr;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool shared;
+ int error;
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ /*
+ * Filter out records that start before our startpoint, if the
+ * caller requested that.
+ */
+ if (xfs_rmap_compare(rec, &info->low) < 0) {
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ }
+
+ /* Are we just counting mappings? */
+ if (info->head->fmh_count == 0) {
+ if (rec_daddr > info->next_daddr)
+ info->head->fmh_entries++;
+
+ if (info->last)
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+ info->head->fmh_entries++;
+
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ }
+
+ /*
+ * If the record starts past the last physical block we saw,
+ * then we've found a gap. Report the gap as being owned by
+ * whatever the caller specified is the missing owner.
+ */
+ if (rec_daddr > info->next_daddr) {
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = info->next_daddr;
+ fmr.fmr_owner = info->missing_owner;
+ fmr.fmr_offset = 0;
+ fmr.fmr_length = rec_daddr - info->next_daddr;
+ fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+ error = info->formatter(&fmr, info->format_arg);
+ if (error)
+ return error;
+ info->head->fmh_entries++;
+ }
+
+ if (info->last)
+ goto out;
+
+ /* Fill out the extent we found */
+ if (info->head->fmh_entries >= info->head->fmh_count)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
+ trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
+
+ fmr.fmr_device = info->dev;
+ fmr.fmr_physical = rec_daddr;
+ error = xfs_fsmap_owner_from_rmap(&fmr, rec);
+ if (error)
+ return error;
+ fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset);
+ fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (rec->rm_flags & XFS_RMAP_UNWRITTEN)
+ fmr.fmr_flags |= FMR_OF_PREALLOC;
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+ fmr.fmr_flags |= FMR_OF_ATTR_FORK;
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ fmr.fmr_flags |= FMR_OF_EXTENT_MAP;
+ if (fmr.fmr_flags == 0) {
+ error = xfs_getfsmap_is_shared(tp, info, rec, &shared);
+ if (error)
+ return error;
+ if (shared)
+ fmr.fmr_flags |= FMR_OF_SHARED;
+ }
+ error = info->formatter(&fmr, info->format_arg);
+ if (error)
+ return error;
+ info->head->fmh_entries++;
+
+out:
+ rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+ if (info->next_daddr < rec_daddr)
+ info->next_daddr = rec_daddr;
+ return XFS_BTREE_QUERY_RANGE_CONTINUE;
+}
+
+/* Transform a rmapbt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_getfsmap_info *info = priv;
+ xfs_fsblock_t fsb;
+ xfs_daddr_t rec_daddr;
+
+ fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock);
+ rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
+
+ return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
+}
+
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
+
+ irec.rm_startblock = rec->ar_startblock;
+ irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
+/* Transform a bnobt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_bnobt_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_alloc_rec_incore *rec,
+ void *priv)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_getfsmap_info *info = priv;
+ struct xfs_rmap_irec irec;
+ xfs_daddr_t rec_daddr;
+
+ rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+ rec->ar_startblock);
+
+ irec.rm_startblock = rec->ar_startblock;
+ irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
+ irec.rm_offset = 0;
+ irec.rm_flags = 0;
+
+ return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr);
+}
+
+/* Set rmap flags based on the getfsmap flags */
+static void
+xfs_getfsmap_set_irec_flags(
+ struct xfs_rmap_irec *irec,
+ struct xfs_fsmap *fmr)
+{
+ irec->rm_flags = 0;
+ if (fmr->fmr_flags & FMR_OF_ATTR_FORK)
+ irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+ if (fmr->fmr_flags & FMR_OF_EXTENT_MAP)
+ irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+ if (fmr->fmr_flags & FMR_OF_PREALLOC)
+ irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+}
+
+/* Execute a getfsmap query against the log device. */
+STATIC int
+xfs_getfsmap_logdev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rmap_irec rmap;
+ int error;
+
+ /* Set up search keys */
+ info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, keys);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1);
+ if (error)
+ return error;
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+ info->missing_owner = XFS_FMR_OWN_FREE;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+ if (keys[0].fmr_physical > 0)
+ return 0;
+
+ /* Fabricate an rmap entry for the external log device. */
+ rmap.rm_startblock = 0;
+ rmap.rm_blockcount = mp->m_sb.sb_logblocks;
+ rmap.rm_owner = XFS_RMAP_OWN_LOG;
+ rmap.rm_offset = 0;
+ rmap.rm_flags = 0;
+
+ return xfs_getfsmap_helper(tp, info, &rmap, 0);
+}
+
+/* Execute a getfsmap query against the realtime device. */
+STATIC int
+__xfs_getfsmap_rtdev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *),
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ xfs_daddr_t eofs;
+ int error = 0;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+ end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical);
+
+ /* Set up search keys */
+ info->low.rm_startblock = start_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = end_fsb;
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ return error;
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset);
+ info->high.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+ return query_fn(tp, info);
+}
+
+/* Actually query the realtime bitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_rtalloc_rec alow;
+ struct xfs_rtalloc_rec ahigh;
+ int error;
+
+ xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+
+ alow.ar_startblock = info->low.rm_startblock;
+ ahigh.ar_startblock = info->high.rm_startblock;
+ error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+ xfs_getfsmap_rtdev_rtbitmap_helper, info);
+ if (error)
+ goto err;
+
+ /* Report any gaps at the end of the rtbitmap */
+ info->last = true;
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+ if (error)
+ goto err;
+err:
+ xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+ return error;
+}
+
+/* Execute a getfsmap query against the realtime device rtbitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
+ info);
+}
+
+/* Execute a getfsmap query against the regular data device. */
+STATIC int
+__xfs_getfsmap_datadev(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info,
+ int (*query_fn)(struct xfs_trans *,
+ struct xfs_getfsmap_info *,
+ struct xfs_btree_cur **,
+ void *),
+ void *priv)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *bt_cur = NULL;
+ xfs_fsblock_t start_fsb;
+ xfs_fsblock_t end_fsb;
+ xfs_agnumber_t start_ag;
+ xfs_agnumber_t end_ag;
+ xfs_daddr_t eofs;
+ int error = 0;
+
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (keys[0].fmr_physical >= eofs)
+ return 0;
+ if (keys[1].fmr_physical >= eofs)
+ keys[1].fmr_physical = eofs - 1;
+ start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical);
+ end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical);
+
+ /*
+ * Convert the fsmap low/high keys to AG based keys. Initialize
+ * low to the fsmap low key and max out the high key to the end
+ * of the AG.
+ */
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ if (error)
+ return error;
+ info->low.rm_blockcount = 0;
+ xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+ info->high.rm_startblock = -1U;
+ info->high.rm_owner = ULLONG_MAX;
+ info->high.rm_offset = ULLONG_MAX;
+ info->high.rm_blockcount = 0;
+ info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+
+ start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
+ end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
+
+ /* Query each AG */
+ for (info->agno = start_ag; info->agno <= end_ag; info->agno++) {
+ /*
+ * Set the AG high key from the fsmap high key if this
+ * is the last AG that we're querying.
+ */
+ if (info->agno == end_ag) {
+ info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
+ end_fsb);
+ info->high.rm_offset = XFS_BB_TO_FSBT(mp,
+ keys[1].fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+ if (error)
+ goto err;
+ xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+ }
+
+ if (bt_cur) {
+ xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+ bt_cur = NULL;
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+
+ error = xfs_alloc_read_agf(mp, tp, info->agno, 0,
+ &info->agf_bp);
+ if (error)
+ goto err;
+
+ trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+ trace_xfs_fsmap_high_key(mp, info->dev, info->agno,
+ &info->high);
+
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ goto err;
+
+ /*
+ * Set the AG low key to the start of the AG prior to
+ * moving on to the next AG.
+ */
+ if (info->agno == start_ag) {
+ info->low.rm_startblock = 0;
+ info->low.rm_owner = 0;
+ info->low.rm_offset = 0;
+ info->low.rm_flags = 0;
+ }
+ }
+
+ /* Report any gap at the end of the AG */
+ info->last = true;
+ error = query_fn(tp, info, &bt_cur, priv);
+ if (error)
+ goto err;
+
+err:
+ if (bt_cur)
+ xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
+ XFS_BTREE_NOERROR);
+ if (info->agf_bp) {
+ xfs_trans_brelse(tp, info->agf_bp);
+ info->agf_bp = NULL;
+ }
+
+ return error;
+}
+
+/* Actually query the rmap btree. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_btree_cur **curpp,
+ void *priv)
+{
+ /* Report any gap at the end of the last AG. */
+ if (info->last)
+ return xfs_getfsmap_datadev_helper(*curpp, &info->high, info);
+
+ /* Allocate cursor for this AG and query_range it. */
+ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->agno);
+ return xfs_rmap_query_range(*curpp, &info->low, &info->high,
+ xfs_getfsmap_datadev_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device rmapbt. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ info->missing_owner = XFS_FMR_OWN_FREE;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_rmapbt_query, NULL);
+}
+
+/* Actually query the bno btree. */
+STATIC int
+xfs_getfsmap_datadev_bnobt_query(
+ struct xfs_trans *tp,
+ struct xfs_getfsmap_info *info,
+ struct xfs_btree_cur **curpp,
+ void *priv)
+{
+ struct xfs_alloc_rec_incore *key = priv;
+
+ /* Report any gap at the end of the last AG. */
+ if (info->last)
+ return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
+
+ /* Allocate cursor for this AG and query_range it. */
+ *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+ info->agno, XFS_BTNUM_BNO);
+ key->ar_startblock = info->low.rm_startblock;
+ key[1].ar_startblock = info->high.rm_startblock;
+ return xfs_alloc_query_range(*curpp, key, &key[1],
+ xfs_getfsmap_datadev_bnobt_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device's bnobt. */
+STATIC int
+xfs_getfsmap_datadev_bnobt(
+ struct xfs_trans *tp,
+ struct xfs_fsmap *keys,
+ struct xfs_getfsmap_info *info)
+{
+ struct xfs_alloc_rec_incore akeys[2];
+
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
+}
+
+/* Do we recognize the device? */
+STATIC bool
+xfs_getfsmap_is_valid_device(
+ struct xfs_mount *mp,
+ struct xfs_fsmap *fm)
+{
+ if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
+ fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
+ return true;
+ if (mp->m_logdev_targp &&
+ fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
+ return true;
+ if (mp->m_rtdev_targp &&
+ fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
+ return true;
+ return false;
+}
+
+/* Ensure that the low key is less than the high key. */
+STATIC bool
+xfs_getfsmap_check_keys(
+ struct xfs_fsmap *low_key,
+ struct xfs_fsmap *high_key)
+{
+ if (low_key->fmr_device > high_key->fmr_device)
+ return false;
+ if (low_key->fmr_device < high_key->fmr_device)
+ return true;
+
+ if (low_key->fmr_physical > high_key->fmr_physical)
+ return false;
+ if (low_key->fmr_physical < high_key->fmr_physical)
+ return true;
+
+ if (low_key->fmr_owner > high_key->fmr_owner)
+ return false;
+ if (low_key->fmr_owner < high_key->fmr_owner)
+ return true;
+
+ if (low_key->fmr_offset > high_key->fmr_offset)
+ return false;
+ if (low_key->fmr_offset < high_key->fmr_offset)
+ return true;
+
+ return false;
+}
+
+#define XFS_GETFSMAP_DEVS 3
+/*
+ * Get filesystem's extents as described in head, and format for
+ * output. Calls formatter to fill the user's buffer until all
+ * extents are mapped, until the passed-in head->fmh_count slots have
+ * been filled, or until the formatter short-circuits the loop, if it
+ * is tracking filled-in extents on its own.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in;
+ * these reflect fs-wide sector addrs.
+ * dkeys -- fmh_keys used to query each device;
+ * these are fmh_keys but w/ the low key
+ * bumped up by fmr_length.
+ * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this
+ * is how we detect gaps in the fsmap
+ records and report them.
+ * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from
+ * dkeys; used to query the metadata.
+ */
+int
+xfs_getfsmap(
+ struct xfs_mount *mp,
+ struct xfs_fsmap_head *head,
+ xfs_fsmap_format_t formatter,
+ void *arg)
+{
+ struct xfs_trans *tp = NULL;
+ struct xfs_fsmap dkeys[2]; /* per-dev keys */
+ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS];
+ struct xfs_getfsmap_info info = { NULL };
+ int i;
+ int error = 0;
+
+ if (head->fmh_iflags & ~FMH_IF_VALID)
+ return -EINVAL;
+ if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) ||
+ !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ head->fmh_entries = 0;
+
+ /* Set up our device handlers. */
+ memset(handlers, 0, sizeof(handlers));
+ handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
+ else
+ handlers[0].fn = xfs_getfsmap_datadev_bnobt;
+ if (mp->m_logdev_targp != mp->m_ddev_targp) {
+ handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+ handlers[1].fn = xfs_getfsmap_logdev;
+ }
+ if (mp->m_rtdev_targp) {
+ handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
+ }
+
+ xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
+ xfs_getfsmap_dev_compare);
+
+ /*
+ * To continue where we left off, we allow userspace to use the
+ * last mapping from a previous call as the low key of the next.
+ * This is identified by a non-zero length in the low key. We
+ * have to increment the low key in this scenario to ensure we
+ * don't return the same mapping again, and instead return the
+ * very next mapping.
+ *
+ * If the low key mapping refers to file data, the same physical
+ * blocks could be mapped to several other files/offsets.
+ * According to rmapbt record ordering, the minimal next
+ * possible record for the block range is the next starting
+ * offset in the same inode. Therefore, bump the file offset to
+ * continue the search appropriately. For all other low key
+ * mapping types (attr blocks, metadata), bump the physical
+ * offset as there can be no other mapping for the same physical
+ * block range.
+ */
+ dkeys[0] = head->fmh_keys[0];
+ if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
+ dkeys[0].fmr_physical += dkeys[0].fmr_length;
+ dkeys[0].fmr_owner = 0;
+ if (dkeys[0].fmr_offset)
+ return -EINVAL;
+ } else
+ dkeys[0].fmr_offset += dkeys[0].fmr_length;
+ dkeys[0].fmr_length = 0;
+ memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
+
+ if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+ return -EINVAL;
+
+ info.next_daddr = head->fmh_keys[0].fmr_physical +
+ head->fmh_keys[0].fmr_length;
+ info.formatter = formatter;
+ info.format_arg = arg;
+ info.head = head;
+
+ /* For each device we support... */
+ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) {
+ /* Is this device within the range the user asked for? */
+ if (!handlers[i].fn)
+ continue;
+ if (head->fmh_keys[0].fmr_device > handlers[i].dev)
+ continue;
+ if (head->fmh_keys[1].fmr_device < handlers[i].dev)
+ break;
+
+ /*
+ * If this device number matches the high key, we have
+ * to pass the high key to the handler to limit the
+ * query results. If the device number exceeds the
+ * low key, zero out the low key so that we get
+ * everything from the beginning.
+ */
+ if (handlers[i].dev == head->fmh_keys[1].fmr_device)
+ dkeys[1] = head->fmh_keys[1];
+ if (handlers[i].dev > head->fmh_keys[0].fmr_device)
+ memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+ if (error)
+ break;
+
+ info.dev = handlers[i].dev;
+ info.last = false;
+ info.agno = NULLAGNUMBER;
+ error = handlers[i].fn(tp, dkeys, &info);
+ if (error)
+ break;
+ xfs_trans_cancel(tp);
+ tp = NULL;
+ info.next_daddr = 0;
+ }
+
+ if (tp)
+ xfs_trans_cancel(tp);
+ head->fmh_oflags = FMH_OF_DEV_T;
+ return error;
+}
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
new file mode 100644
index 000000000000..0b9bf822595c
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_FSMAP_H__
+#define __XFS_FSMAP_H__
+
+struct fsmap;
+
+/* internal fsmap representation */
+struct xfs_fsmap {
+ dev_t fmr_device; /* device id */
+ uint32_t fmr_flags; /* mapping flags */
+ uint64_t fmr_physical; /* device offset of segment */
+ uint64_t fmr_owner; /* owner id */
+ xfs_fileoff_t fmr_offset; /* file offset of segment */
+ xfs_filblks_t fmr_length; /* length of segment, blocks */
+};
+
+struct xfs_fsmap_head {
+ uint32_t fmh_iflags; /* control flags */
+ uint32_t fmh_oflags; /* output flags */
+ unsigned int fmh_count; /* # of entries in array incl. input */
+ unsigned int fmh_entries; /* # of entries filled in (output). */
+
+ struct xfs_fsmap fmh_keys[2]; /* low and high keys */
+};
+
+void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
+void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
+
+/* fsmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
+
+int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
+ xfs_fsmap_format_t formatter, void *arg);
+
+#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3531f8f72fa5..f61c84f8e31a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag(
xfs_perag_clear_reclaim_tag(pag);
}
+static void
+xfs_inew_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
+
+ do {
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (!xfs_iflags_test(ip, XFS_INEW))
+ break;
+ schedule();
+ } while (true);
+ finish_wait(wq, &wait.wait);
+}
+
/*
* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
* part of the structure. This is made more complex by the fact we store
@@ -366,14 +382,17 @@ xfs_iget_cache_hit(
error = xfs_reinit_inode(mp, inode);
if (error) {
+ bool wake;
/*
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
rcu_read_lock();
spin_lock(&ip->i_flags_lock);
-
+ wake = !!__xfs_iflags_test(ip, XFS_INEW);
ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ if (wake)
+ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
trace_xfs_iget_reclaim_fail(ip);
goto out_error;
@@ -623,9 +642,11 @@ out_error_or_again:
STATIC int
xfs_inode_ag_walk_grab(
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ int flags)
{
struct inode *inode = VFS_I(ip);
+ bool newinos = !!(flags & XFS_AGITER_INEW_WAIT);
ASSERT(rcu_read_lock_held());
@@ -643,7 +664,8 @@ xfs_inode_ag_walk_grab(
goto out_unlock_noent;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
- if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+ if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
+ __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock_noent;
spin_unlock(&ip->i_flags_lock);
@@ -671,7 +693,8 @@ xfs_inode_ag_walk(
void *args),
int flags,
void *args,
- int tag)
+ int tag,
+ int iter_flags)
{
uint32_t first_index;
int last_error = 0;
@@ -713,7 +736,7 @@ restart:
for (i = 0; i < nr_found; i++) {
struct xfs_inode *ip = batch[i];
- if (done || xfs_inode_ag_walk_grab(ip))
+ if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
batch[i] = NULL;
/*
@@ -741,6 +764,9 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
+ if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
+ xfs_iflags_test(batch[i], XFS_INEW))
+ xfs_inew_wait(batch[i]);
error = execute(batch[i], flags, args);
IRELE(batch[i]);
if (error == -EAGAIN) {
@@ -820,12 +846,13 @@ xfs_cowblocks_worker(
}
int
-xfs_inode_ag_iterator(
+xfs_inode_ag_iterator_flags(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
- void *args)
+ void *args,
+ int iter_flags)
{
struct xfs_perag *pag;
int error = 0;
@@ -835,7 +862,8 @@ xfs_inode_ag_iterator(
ag = 0;
while ((pag = xfs_perag_get(mp, ag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
+ iter_flags);
xfs_perag_put(pag);
if (error) {
last_error = error;
@@ -847,6 +875,17 @@ xfs_inode_ag_iterator(
}
int
+xfs_inode_ag_iterator(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int flags,
+ void *args)
+{
+ return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
+}
+
+int
xfs_inode_ag_iterator_tag(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags,
@@ -863,7 +902,8 @@ xfs_inode_ag_iterator_tag(
ag = 0;
while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
+ 0);
xfs_perag_put(pag);
if (error) {
last_error = error;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 8a7c849b4dea..9183f77958ef 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -48,6 +48,11 @@ struct xfs_eofblocks {
#define XFS_IGET_UNTRUSTED 0x2
#define XFS_IGET_DONTCACHE 0x4
+/*
+ * flags for AG inode iterator
+ */
+#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */
+
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
@@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
+int xfs_inode_ag_iterator_flags(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
+ int flags, void *args, int iter_flags);
int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args, int tag);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7605d8396596..ec9826c56500 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1906,12 +1906,13 @@ xfs_inactive(
* force is true because we are evicting an inode from the
* cache. Post-eof blocks must be freed, lest we end up with
* broken free space accounting.
+ *
+ * Note: don't bother with iolock here since lockdep complains
+ * about acquiring it in reclaim context. We have the only
+ * reference to the inode at this point anyways.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (xfs_can_free_eofblocks(ip, true))
xfs_free_eofblocks(ip);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- }
return;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 10dcf27b4c85..10e89fcb49d7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
-#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define __XFS_INEW_BIT 3 /* inode has just been allocated */
+#define XFS_INEW (1 << __XFS_INEW_BIT)
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
@@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
xfs_iflags_clear(ip, XFS_INEW);
barrier();
unlock_new_inode(VFS_I(ip));
+ wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
}
static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d90e7811ccdd..08cb7d1a4a3a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -731,22 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- struct xfs_log_item *log_items[need_ail];
- int i = 0;
+ bool mlip_changed = false;
+
+ /* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->xa_lock);
for (blip = lip; blip; blip = blip->li_bio_list) {
- iip = INODE_ITEM(blip);
- if (iip->ili_logged &&
- blip->li_lsn == iip->ili_flush_lsn) {
- log_items[i++] = blip;
- }
- ASSERT(i <= need_ail);
+ if (INODE_ITEM(blip)->ili_logged &&
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+ mlip_changed |= xfs_ail_delete_one(ailp, blip);
}
- /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
- xfs_trans_ail_delete_bulk(ailp, log_items, i,
- SHUTDOWN_CORRUPT_INCORE);
- }
+ if (mlip_changed) {
+ if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+ xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (list_empty(&ailp->xa_ail))
+ wake_up_all(&ailp->xa_empty);
+ }
+ spin_unlock(&ailp->xa_lock);
+
+ if (mlip_changed)
+ xfs_log_space_wake(ailp->xa_mount);
+ }
/*
* clean up and unlock the flush lock now we are done. We can clear the
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 2fd7fdf5438f..6190697603c9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,9 @@
#include "xfs_trans.h"
#include "xfs_pnfs.h"
#include "xfs_acl.h"
+#include "xfs_btree.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
#include <linux/capability.h>
#include <linux/cred.h>
@@ -1543,10 +1546,11 @@ xfs_ioc_getbmap(
unsigned int cmd,
void __user *arg)
{
- struct getbmapx bmx;
+ struct getbmapx bmx = { 0 };
int error;
- if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
+ /* struct getbmap is a strict subset of struct getbmapx. */
+ if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags)))
return -EFAULT;
if (bmx.bmv_count < 2)
@@ -1608,6 +1612,84 @@ xfs_ioc_getbmapx(
return 0;
}
+struct getfsmap_info {
+ struct xfs_mount *mp;
+ struct fsmap_head __user *data;
+ unsigned int idx;
+ __u32 last_flags;
+};
+
+STATIC int
+xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
+{
+ struct getfsmap_info *info = priv;
+ struct fsmap fm;
+
+ trace_xfs_getfsmap_mapping(info->mp, xfm);
+
+ info->last_flags = xfm->fmr_flags;
+ xfs_fsmap_from_internal(&fm, xfm);
+ if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm,
+ sizeof(struct fsmap)))
+ return -EFAULT;
+
+ return 0;
+}
+
+STATIC int
+xfs_ioc_getfsmap(
+ struct xfs_inode *ip,
+ struct fsmap_head __user *arg)
+{
+ struct getfsmap_info info = { NULL };
+ struct xfs_fsmap_head xhead = {0};
+ struct fsmap_head head;
+ bool aborted = false;
+ int error;
+
+ if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+ return -EFAULT;
+ if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+ memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+ sizeof(head.fmh_keys[0].fmr_reserved)) ||
+ memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+ sizeof(head.fmh_keys[1].fmr_reserved)))
+ return -EINVAL;
+
+ xhead.fmh_iflags = head.fmh_iflags;
+ xhead.fmh_count = head.fmh_count;
+ xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
+ xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+ trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+ trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
+
+ info.mp = ip->i_mount;
+ info.data = arg;
+ error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ error = 0;
+ aborted = true;
+ } else if (error)
+ return error;
+
+ /* If we didn't abort, set the "last" flag in the last fmx */
+ if (!aborted && info.idx) {
+ info.last_flags |= FMR_OF_LAST;
+ if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
+ &info.last_flags, sizeof(info.last_flags)))
+ return -EFAULT;
+ }
+
+ /* copy back header */
+ head.fmh_entries = xhead.fmh_entries;
+ head.fmh_oflags = xhead.fmh_oflags;
+ if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
+ return -EFAULT;
+
+ return 0;
+}
+
int
xfs_ioc_swapext(
xfs_swapext_t *sxp)
@@ -1788,6 +1870,9 @@ xfs_file_ioctl(
case XFS_IOC_GETBMAPX:
return xfs_ioc_getbmapx(ip, arg);
+ case FS_IOC_GETFSMAP:
+ return xfs_ioc_getfsmap(ip, arg);
+
case XFS_IOC_FD_TO_HANDLE:
case XFS_IOC_PATH_TO_HANDLE:
case XFS_IOC_PATH_TO_FSHANDLE: {
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7c49938c5aed..fa0bc4d46065 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -20,6 +20,7 @@
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/fsmap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
@@ -554,6 +555,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_GOINGDOWN:
case XFS_IOC_ERROR_INJECTION:
case XFS_IOC_ERROR_CLEARALL:
+ case FS_IOC_GETFSMAP:
return xfs_file_ioctl(filp, cmd, p);
#ifndef BROKEN_X86_ALIGNMENT
/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..a63f61c256bd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -240,7 +240,7 @@ xfs_iomap_write_direct(
*/
if (IS_DAX(VFS_I(ip))) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
- if (ISUNWRITTEN(imap)) {
+ if (imap->br_state == XFS_EXT_UNWRITTEN) {
tflags |= XFS_TRANS_RESERVE;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
}
@@ -945,7 +945,7 @@ static inline bool imap_needs_alloc(struct inode *inode,
return !nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
imap->br_startblock == DELAYSTARTBLOCK ||
- (IS_DAX(inode) && ISUNWRITTEN(imap));
+ (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
}
static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
@@ -976,6 +976,7 @@ xfs_file_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false, trimmed = false;
unsigned lockmode;
+ struct block_device *bdev;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1063,6 +1064,14 @@ xfs_file_iomap_begin(
}
xfs_bmbt_to_iomap(ip, iomap, &imap);
+
+ /* optionally associate a dax device with the iomap bdev */
+ bdev = iomap->bdev;
+ if (blk_queue_dax(bdev->bd_queue))
+ iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+ else
+ iomap->dax_dev = NULL;
+
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
@@ -1140,6 +1149,7 @@ xfs_file_iomap_end(
unsigned flags,
struct iomap *iomap)
{
+ put_dax(iomap->dax_dev);
if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
length, written, iomap);
@@ -1170,10 +1180,10 @@ xfs_xattr_iomap_begin(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- lockmode = xfs_ilock_data_map_shared(ip);
+ lockmode = xfs_ilock_attr_map_shared(ip);
/* if there are no attribute fork or extents, return ENOENT */
- if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+ if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
error = -ENOENT;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 592fdf7111cb..044fb0e15390 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -212,88 +212,6 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
#define xfs_stack_trace() dump_stack()
-
-/* Move the kernel do_div definition off to one side */
-
-#if defined __i386__
-/* For ia32 we need to pull some tricks to get past various versions
- * of the compiler which do not like us using do_div in the middle
- * of large functions.
- */
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
- __u32 mod;
-
- switch (n) {
- case 4:
- mod = *(__u32 *)a % b;
- *(__u32 *)a = *(__u32 *)a / b;
- return mod;
- case 8:
- {
- unsigned long __upper, __low, __high, __mod;
- __u64 c = *(__u64 *)a;
- __upper = __high = c >> 32;
- __low = c;
- if (__high) {
- __upper = __high % (b);
- __high = __high / (b);
- }
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
- asm("":"=A" (c):"a" (__low),"d" (__high));
- *(__u64 *)a = c;
- return __mod;
- }
- }
-
- /* NOTREACHED */
- return 0;
-}
-
-/* Side effect free 64 bit mod operation */
-static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
-{
- switch (n) {
- case 4:
- return *(__u32 *)a % b;
- case 8:
- {
- unsigned long __upper, __low, __high, __mod;
- __u64 c = *(__u64 *)a;
- __upper = __high = c >> 32;
- __low = c;
- if (__high) {
- __upper = __high % (b);
- __high = __high / (b);
- }
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
- asm("":"=A" (c):"a" (__low),"d" (__high));
- return __mod;
- }
- }
-
- /* NOTREACHED */
- return 0;
-}
-#else
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
- __u32 mod;
-
- switch (n) {
- case 4:
- mod = *(__u32 *)a % b;
- *(__u32 *)a = *(__u32 *)a / b;
- return mod;
- case 8:
- mod = do_div(*(__u64 *)a, b);
- return mod;
- }
-
- /* NOTREACHED */
- return 0;
-}
-
/* Side effect free 64 bit mod operation */
static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
{
@@ -310,10 +228,7 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
/* NOTREACHED */
return 0;
}
-#endif
-#undef do_div
-#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b1469f0a91a6..3731f13f63e9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1293,7 +1293,7 @@ void
xfs_log_work_queue(
struct xfs_mount *mp)
{
- queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+ queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work,
msecs_to_jiffies(xfs_syncd_centisecs * 10));
}
@@ -1852,7 +1852,7 @@ xlog_sync(
*/
if (log->l_badcrc_factor &&
(prandom_u32() % log->l_badcrc_factor == 0)) {
- iclog->ic_header.h_crc &= 0xAAAAAAAA;
+ iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_state |= XLOG_STATE_IOABORT;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 688ebff1f663..2eaf81859166 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -73,6 +73,10 @@ xfs_uuid_mount(
uuid_t *uuid = &mp->m_sb.sb_uuid;
int hole, i;
+ /* Publish UUID in struct super_block */
+ BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t));
+ memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t));
+
if (mp->m_flags & XFS_MOUNT_NOUUID)
return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 6db6fd6b82b0..9fa312a41c93 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,7 @@ typedef struct xfs_mount {
struct workqueue_struct *m_reclaim_workqueue;
struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
+ struct workqueue_struct *m_sync_workqueue;
/*
* Generation of the filesysyem layout. This is incremented by each
@@ -312,7 +313,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
static inline xfs_agnumber_t
xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
{
- xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
do_div(ld, mp->m_sb.sb_agblocks);
return (xfs_agnumber_t) ld;
}
@@ -320,7 +321,7 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
static inline xfs_agblock_t
xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
{
- xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+ xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b669b123287b..5fe6e70b88ef 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -851,8 +851,8 @@ xfs_qm_reset_dqcounts(
* started afresh by xfs_qm_quotacheck.
*/
#ifdef DEBUG
- j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- do_div(j, sizeof(xfs_dqblk_t));
+ j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
+ sizeof(xfs_dqblk_t);
ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
#endif
dqb = bp->b_addr;
@@ -1384,12 +1384,7 @@ xfs_qm_quotacheck(
mp->m_qflags |= flags;
error_return:
- while (!list_empty(&buffer_list)) {
- struct xfs_buf *bp =
- list_first_entry(&buffer_list, struct xfs_buf, b_list);
- list_del_init(&bp->b_list);
- xfs_buf_relse(bp);
- }
+ xfs_buf_delwri_cancel(&buffer_list);
if (error) {
xfs_warn(mp,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 475a3882a81f..9cb5c381b01c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
+ xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL,
+ XFS_AGITER_INEW_WAIT);
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 6e4c7446c3d4..96fe209b5eb6 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -221,6 +221,7 @@ void
xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
+ ASSERT(atomic_read(&cuip->cui_refcount) > 0);
if (atomic_dec_and_test(&cuip->cui_refcount)) {
xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_cui_item_free(cuip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 4a84c5ea266d..ffe6fe7a7eb5 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -206,11 +206,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) ||
- ISUNWRITTEN(irec) ||
- irec->br_startblock == HOLESTARTBLOCK ||
- irec->br_startblock == DELAYSTARTBLOCK ||
- isnullstartblock(irec->br_startblock)) {
+ if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -709,8 +705,22 @@ xfs_reflink_end_cow(
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
- /* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ /*
+ * Start a rolling transaction to switch the mappings. We're
+ * unlikely ever to have to remap 16T worth of single-block
+ * extents, so just cap the worst case extent count to 2^32-1.
+ * Stick a warning in just in case, and avoid 64-bit division.
+ */
+ BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
+ if (end_fsb - offset_fsb > UINT_MAX) {
+ error = -EFSCORRUPTED;
+ xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
+ ASSERT(0);
+ goto out;
+ }
+ resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
+ (unsigned int)(end_fsb - offset_fsb),
+ XFS_DATA_FORK);
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
resblks, 0, 0, &tp);
if (error)
@@ -1045,12 +1055,12 @@ xfs_reflink_remap_extent(
xfs_off_t new_isize)
{
struct xfs_mount *mp = ip->i_mount;
+ bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
xfs_fsblock_t firstfsb;
unsigned int resblks;
struct xfs_defer_ops dfops;
struct xfs_bmbt_irec uirec;
- bool real_extent;
xfs_filblks_t rlen;
xfs_filblks_t unmap_len;
xfs_off_t newlen;
@@ -1059,11 +1069,6 @@ xfs_reflink_remap_extent(
unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
- /* Only remap normal extents. */
- real_extent = (irec->br_startblock != HOLESTARTBLOCK &&
- irec->br_startblock != DELAYSTARTBLOCK &&
- !ISUNWRITTEN(irec));
-
/* No reflinking if we're low on space */
if (real_extent) {
error = xfs_reflink_ag_has_free_space(mp,
@@ -1359,9 +1364,7 @@ xfs_reflink_dirty_extents(
goto out;
if (nmaps == 0)
break;
- if (map[0].br_startblock == HOLESTARTBLOCK ||
- map[0].br_startblock == DELAYSTARTBLOCK ||
- ISUNWRITTEN(&map[0]))
+ if (!xfs_bmap_is_real_extent(&map[0]))
goto next;
map[1] = map[0];
@@ -1435,9 +1438,7 @@ xfs_reflink_clear_inode_flag(
return error;
if (nmaps == 0)
break;
- if (map.br_startblock == HOLESTARTBLOCK ||
- map.br_startblock == DELAYSTARTBLOCK ||
- ISUNWRITTEN(&map))
+ if (!xfs_bmap_is_real_extent(&map))
goto next;
agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 73c827831551..f3b139c9aa16 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -243,6 +243,7 @@ void
xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
+ ASSERT(atomic_read(&ruip->rui_refcount) > 0);
if (atomic_dec_and_test(&ruip->rui_refcount)) {
xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_rui_item_free(ruip);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 51dd3c726608..f13133e6f19f 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,6 +23,16 @@
struct xfs_mount;
struct xfs_trans;
+struct xfs_rtalloc_rec {
+ xfs_rtblock_t ar_startblock;
+ xfs_rtblock_t ar_blockcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *rec,
+ void *priv);
+
#ifdef CONFIG_XFS_RT
/*
* Function prototypes for exported functions.
@@ -118,13 +128,21 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-
-
+int xfs_rtalloc_query_range(struct xfs_trans *tp,
+ struct xfs_rtalloc_rec *low_rec,
+ struct xfs_rtalloc_rec *high_rec,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
+int xfs_rtalloc_query_all(struct xfs_trans *tp,
+ xfs_rtalloc_query_range_fn fn,
+ void *priv);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
+# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 685c042a120f..47d239dcf3f4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -877,8 +877,15 @@ xfs_init_mount_workqueues(
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
+ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
+ mp->m_fsname);
+ if (!mp->m_sync_workqueue)
+ goto out_destroy_eofb;
+
return 0;
+out_destroy_eofb:
+ destroy_workqueue(mp->m_eofblocks_workqueue);
out_destroy_log:
destroy_workqueue(mp->m_log_workqueue);
out_destroy_reclaim:
@@ -899,6 +906,7 @@ STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
+ destroy_workqueue(mp->m_sync_workqueue);
destroy_workqueue(mp->m_eofblocks_workqueue);
destroy_workqueue(mp->m_log_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 7f17ae6d709a..5d95fe348294 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -47,6 +47,7 @@
#include "xfs_inode_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_filestream.h"
+#include "xfs_fsmap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 383ac227ce2c..7c5a16528d8b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -40,6 +40,8 @@ struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
struct xfs_refcount_irec;
+struct xfs_fsmap;
+struct xfs_rmap_irec;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -2190,7 +2192,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class,
__entry->agbno = agbno;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2253,8 +2255,8 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
- __field(bool, committed)
- __field(bool, low)
+ __field(char, committed)
+ __field(char, low)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
@@ -2262,7 +2264,7 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
__entry->committed = dop->dop_committed;
__entry->low = dop->dop_low;
),
- TP_printk("dev %d:%d ops %p committed %d low %d\n",
+ TP_printk("dev %d:%d ops %p committed %d low %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
@@ -2279,8 +2281,8 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
- __field(bool, committed)
- __field(bool, low)
+ __field(char, committed)
+ __field(char, low)
__field(int, error)
),
TP_fast_assign(
@@ -2290,7 +2292,7 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
__entry->low = dop->dop_low;
__entry->error = error;
),
- TP_printk("dev %d:%d ops %p committed %d low %d err %d\n",
+ TP_printk("dev %d:%d ops %p committed %d low %d err %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
@@ -2309,7 +2311,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
__field(dev_t, dev)
__field(int, type)
__field(void *, intent)
- __field(bool, committed)
+ __field(char, committed)
__field(int, nr)
),
TP_fast_assign(
@@ -2319,7 +2321,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",
+ TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->type,
__entry->intent,
@@ -2614,7 +2616,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class,
__entry->asked = r ? r->ar_asked : 0;
__entry->len = len;
),
- TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n",
+ TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u "
+ "resv %u ask %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->resv,
@@ -2667,7 +2670,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
__entry->agbno = agbno;
__entry->dir = dir;
),
- TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n",
+ TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
@@ -2700,7 +2703,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
__entry->blockcount = irec->rc_blockcount;
__entry->refcount = irec->rc_refcount;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2735,7 +2738,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
__entry->refcount = irec->rc_refcount;
__entry->agbno = agbno;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n",
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->startblock,
@@ -2776,7 +2779,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
__entry->i2_refcount = i2->rc_refcount;
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u\n",
+ "agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2822,7 +2825,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
__entry->agbno = agbno;
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u @ agbno %u\n",
+ "agbno %u len %u refcount %u @ agbno %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -2875,7 +2878,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
),
TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
"agbno %u len %u refcount %u -- "
- "agbno %u len %u refcount %u\n",
+ "agbno %u len %u refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->i1_startblock,
@@ -3001,31 +3004,6 @@ DEFINE_EVENT(xfs_inode_error_class, name, \
unsigned long caller_ip), \
TP_ARGS(ip, error, caller_ip))
-/* reflink allocator */
-TRACE_EVENT(xfs_bmap_remap_alloc,
- TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
- xfs_extlen_t len),
- TP_ARGS(ip, fsbno, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fsblock_t, fsbno)
- __field(xfs_extlen_t, len)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->fsbno = fsbno;
- __entry->len = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->fsbno,
- __entry->len)
-);
-DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
-
/* reflink tracepoint classes */
/* two-file io tracepoint class */
@@ -3227,7 +3205,7 @@ TRACE_EVENT(xfs_ioctl_clone,
),
TP_printk("dev %d:%d "
"ino 0x%lx isize 0x%llx -> "
- "ino 0x%lx isize 0x%llx\n",
+ "ino 0x%lx isize 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->src_ino,
__entry->src_isize,
@@ -3267,6 +3245,88 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+/* fsmap traces */
+DECLARE_EVENT_CLASS(xfs_fsmap_class,
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
+ struct xfs_rmap_irec *rmap),
+ TP_ARGS(mp, keydev, agno, rmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_fsblock_t, bno)
+ __field(xfs_filblks_t, len)
+ __field(__uint64_t, owner)
+ __field(__uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(keydev);
+ __entry->agno = agno;
+ __entry->bno = rmap->rm_startblock;
+ __entry->len = rmap->rm_blockcount;
+ __entry->owner = rmap->rm_owner;
+ __entry->offset = rmap->rm_offset;
+ __entry->flags = rmap->rm_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->agno,
+ __entry->bno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_FSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
+ struct xfs_rmap_irec *rmap), \
+ TP_ARGS(mp, keydev, agno, rmap))
+DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_mapping);
+
+DECLARE_EVENT_CLASS(xfs_getfsmap_class,
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap),
+ TP_ARGS(mp, fsmap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, keydev)
+ __field(xfs_daddr_t, block)
+ __field(xfs_daddr_t, len)
+ __field(__uint64_t, owner)
+ __field(__uint64_t, offset)
+ __field(__uint64_t, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->keydev = new_decode_dev(fsmap->fmr_device);
+ __entry->block = fsmap->fmr_physical;
+ __entry->len = fsmap->fmr_length;
+ __entry->owner = fsmap->fmr_owner;
+ __entry->offset = fsmap->fmr_offset;
+ __entry->flags = fsmap->fmr_flags;
+ ),
+ TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->keydev), MINOR(__entry->keydev),
+ __entry->block,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+)
+#define DEFINE_GETFSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_getfsmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \
+ TP_ARGS(mp, fsmap))
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 70f42ea86dfb..2011620008de 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,7 +134,7 @@ xfs_trans_reserve(
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
/*
* Attempt to reserve the needed disk blocks by decrementing
@@ -144,7 +144,7 @@ xfs_trans_reserve(
if (blocks > 0) {
error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
}
tp->t_blk_res += blocks;
@@ -221,7 +221,7 @@ undo_blocks:
tp->t_blk_res = 0;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return error;
}
@@ -263,6 +263,28 @@ xfs_trans_alloc(
}
/*
+ * Create an empty transaction with no reservation. This is a defensive
+ * mechanism for routines that query metadata without actually modifying
+ * them -- if the metadata being queried is somehow cross-linked (think a
+ * btree block pointer that points higher in the tree), we risk deadlock.
+ * However, blocks grabbed as part of a transaction can be re-grabbed.
+ * The verifiers will notice the corrupt block and the operation will fail
+ * back to userspace without deadlocking.
+ *
+ * Note the zero-length reservation; this transaction MUST be cancelled
+ * without any dirty data.
+ */
+int
+xfs_trans_alloc_empty(
+ struct xfs_mount *mp,
+ struct xfs_trans **tpp)
+{
+ struct xfs_trans_res resv = {0};
+
+ return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+}
+
+/*
* Record the indicated change to the given field for application
* to the file system's superblock when the transaction commits.
* For now, just store the change in the transaction structure.
@@ -914,7 +936,7 @@ __xfs_trans_commit(
xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -944,7 +966,7 @@ out_unreserve:
if (commit_lsn == -1 && !error)
error = -EIO;
}
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
@@ -998,7 +1020,7 @@ xfs_trans_cancel(
xfs_log_done(mp, tp->t_ticket, NULL, false);
/* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
xfs_trans_free(tp);
@@ -1012,17 +1034,14 @@ xfs_trans_cancel(
* chunk we've been working on and get a new transaction to continue.
*/
int
-__xfs_trans_roll(
+xfs_trans_roll(
struct xfs_trans **tpp,
- struct xfs_inode *dp,
- int *committed)
+ struct xfs_inode *dp)
{
struct xfs_trans *trans;
struct xfs_trans_res tres;
int error;
- *committed = 0;
-
/*
* Ensure that the inode is always logged.
*/
@@ -1048,7 +1067,6 @@ __xfs_trans_roll(
if (error)
return error;
- *committed = 1;
trans = *tpp;
/*
@@ -1071,12 +1089,3 @@ __xfs_trans_roll(
xfs_trans_ijoin(trans, dp, 0);
return 0;
}
-
-int
-xfs_trans_roll(
- struct xfs_trans **tpp,
- struct xfs_inode *dp)
-{
- int committed;
- return __xfs_trans_roll(tpp, dp, &committed);
-}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 1646f659b60f..a07acbf0bd8a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -158,6 +158,8 @@ typedef struct xfs_trans {
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
uint blocks, uint rtextents, uint flags,
struct xfs_trans **tpp);
+int xfs_trans_alloc_empty(struct xfs_mount *mp,
+ struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
@@ -226,7 +228,6 @@ int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
xfs_extlen_t, struct xfs_owner_info *);
int xfs_trans_commit(struct xfs_trans *);
-int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d6c9c3e9e02b..9056c0f34a3c 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk(
}
}
-/*
- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
+bool
+xfs_ail_delete_one(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_log_item *mlip = xfs_ail_min(ailp);
+
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+ xfs_ail_delete(ailp, lip);
+ lip->li_flags &= ~XFS_LI_IN_AIL;
+ lip->li_lsn = 0;
+
+ return mlip == lip;
+}
+
+/**
+ * Remove a log items from the AIL
*
* @xfs_trans_ail_delete_bulk takes an array of log items that all need to
* removed from the AIL. The caller is already holding the AIL lock, and done
@@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk(
* before returning.
*/
void
-xfs_trans_ail_delete_bulk(
+xfs_trans_ail_delete(
struct xfs_ail *ailp,
- struct xfs_log_item **log_items,
- int nr_items,
+ struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->xa_lock)
{
- xfs_log_item_t *mlip;
- int mlip_changed = 0;
- int i;
+ struct xfs_mount *mp = ailp->xa_mount;
+ bool mlip_changed;
- mlip = xfs_ail_min(ailp);
-
- for (i = 0; i < nr_items; i++) {
- struct xfs_log_item *lip = log_items[i];
- if (!(lip->li_flags & XFS_LI_IN_AIL)) {
- struct xfs_mount *mp = ailp->xa_mount;
-
- spin_unlock(&ailp->xa_lock);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
- "%s: attempting to delete a log item that is not in the AIL",
- __func__);
- xfs_force_shutdown(mp, shutdown_type);
- }
- return;
+ if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+ "%s: attempting to delete a log item that is not in the AIL",
+ __func__);
+ xfs_force_shutdown(mp, shutdown_type);
}
-
- trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
- xfs_ail_delete(ailp, lip);
- lip->li_flags &= ~XFS_LI_IN_AIL;
- lip->li_lsn = 0;
- if (mlip == lip)
- mlip_changed = 1;
+ return;
}
+ mlip_changed = xfs_ail_delete_one(ailp, lip);
if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
- xlog_assign_tail_lsn_locked(ailp->xa_mount);
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->xa_ail))
wake_up_all(&ailp->xa_empty);
- spin_unlock(&ailp->xa_lock);
+ }
+ spin_unlock(&ailp->xa_lock);
+ if (mlip_changed)
xfs_log_space_wake(ailp->xa_mount);
- } else {
- spin_unlock(&ailp->xa_lock);
- }
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 49931b72da8a..d91706c56c63 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -106,18 +106,9 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
- struct xfs_log_item **log_items, int nr_items,
- int shutdown_type)
- __releases(ailp->xa_lock);
-static inline void
-xfs_trans_ail_delete(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip,
- int shutdown_type) __releases(ailp->xa_lock)
-{
- xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
-}
+bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
+ int shutdown_type) __releases(ailp->xa_lock);
static inline void
xfs_trans_ail_remove(