summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c21
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_inode.c126
-rw-r--r--fs/9p/vfs_super.c39
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/afs/file.c18
-rw-r--r--fs/afs/flock.c2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c47
-rw-r--r--fs/autofs/dirhash.c2
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/binfmt_elf.c72
-rw-r--r--fs/binfmt_elf_fdpic.c56
-rw-r--r--fs/block_dev.c32
-rw-r--r--fs/btrfs/disk-io.c4
-rw-r--r--fs/btrfs/extent-tree.c3
-rw-r--r--fs/btrfs/inode.c49
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/super.c4
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/buffer.c9
-rw-r--r--fs/char_dev.c40
-rw-r--r--fs/cifs/CHANGES5
-rw-r--r--fs/cifs/cifs_dfs_ref.c4
-rw-r--r--fs/cifs/cifs_spnego.c2
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c26
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h21
-rw-r--r--fs/cifs/cifssmb.c316
-rw-r--r--fs/cifs/connect.c49
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c43
-rw-r--r--fs/cifs/inode.c6
-rw-r--r--fs/cifs/transport.c17
-rw-r--r--fs/compat.c17
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/dlm/lowcomms.c26
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h2
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/exec.c73
-rw-r--r--fs/ext2/acl.c8
-rw-r--r--fs/ext2/acl.h4
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/namei.c8
-rw-r--r--fs/ext2/xip.c2
-rw-r--r--fs/ext3/Kconfig32
-rw-r--r--fs/ext3/acl.c8
-rw-r--r--fs/ext3/acl.h4
-rw-r--r--fs/ext3/file.c63
-rw-r--r--fs/ext3/fsync.c12
-rw-r--r--fs/ext3/inode.c28
-rw-r--r--fs/ext3/namei.c4
-rw-r--r--fs/ext3/super.c44
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/acl.c8
-rw-r--r--fs/ext4/acl.h4
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h4
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/extents.c112
-rw-r--r--fs/ext4/file.c55
-rw-r--r--fs/ext4/fsync.c13
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c152
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c429
-rw-r--r--fs/ext4/mballoc.h22
-rw-r--r--fs/ext4/migrate.c22
-rw-r--r--fs/ext4/move_extent.c334
-rw-r--r--fs/ext4/namei.c26
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c159
-rw-r--r--fs/ext4/xattr.c15
-rw-r--r--fs/fat/file.c22
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/fs-writeback.c1104
-rw-r--r--fs/fuse/control.c138
-rw-r--r--fs/fuse/dev.c10
-rw-r--r--fs/fuse/fuse_i.h18
-rw-r--r--fs/fuse/inode.c83
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c106
-rw-r--r--fs/gfs2/dentry.c18
-rw-r--r--fs/gfs2/eaops.c157
-rw-r--r--fs/gfs2/eaops.h30
-rw-r--r--fs/gfs2/export.c36
-rw-r--r--fs/gfs2/file.c1
-rw-r--r--fs/gfs2/incore.h15
-rw-r--r--fs/gfs2/inode.c159
-rw-r--r--fs/gfs2/ops_fstype.c66
-rw-r--r--fs/gfs2/ops_inode.c82
-rw-r--r--fs/gfs2/rgrp.c90
-rw-r--r--fs/gfs2/rgrp.h6
-rw-r--r--fs/gfs2/super.c46
-rw-r--r--fs/gfs2/super.h5
-rw-r--r--fs/gfs2/sys.c49
-rw-r--r--fs/gfs2/util.c41
-rw-r--r--fs/gfs2/xattr.c (renamed from fs/gfs2/eattr.c)425
-rw-r--r--fs/gfs2/xattr.h (renamed from fs/gfs2/eattr.h)54
-rw-r--r--fs/hugetlbfs/inode.c40
-rw-r--r--fs/inode.c20
-rw-r--r--fs/jbd/checkpoint.c6
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/journal.c30
-rw-r--r--fs/jbd/recovery.c18
-rw-r--r--fs/jbd/revoke.c16
-rw-r--r--fs/jbd/transaction.c9
-rw-r--r--fs/jbd2/commit.c12
-rw-r--r--fs/jbd2/journal.c6
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jffs2/symlink.c2
-rw-r--r--fs/jffs2/wbuf.c10
-rw-r--r--fs/jfs/acl.c7
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/lockd/clntlock.c2
-rw-r--r--fs/lockd/clntproc.c2
-rw-r--r--fs/lockd/host.c18
-rw-r--r--fs/lockd/mon.c46
-rw-r--r--fs/lockd/svclock.c2
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/locks.c6
-rw-r--r--fs/namei.c110
-rw-r--r--fs/nfs/Makefile3
-rw-r--r--fs/nfs/cache_lib.c140
-rw-r--r--fs/nfs/cache_lib.h27
-rw-r--r--fs/nfs/callback.c26
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/client.c26
-rw-r--r--fs/nfs/direct.c23
-rw-r--r--fs/nfs/dns_resolve.c335
-rw-r--r--fs/nfs/dns_resolve.h14
-rw-r--r--fs/nfs/file.c49
-rw-r--r--fs/nfs/idmap.c6
-rw-r--r--fs/nfs/inode.c100
-rw-r--r--fs/nfs/internal.h39
-rw-r--r--fs/nfs/mount_clnt.c83
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4namespace.c24
-rw-r--r--fs/nfs/nfs4proc.c40
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4xdr.c1460
-rw-r--r--fs/nfs/read.c6
-rw-r--r--fs/nfs/super.c455
-rw-r--r--fs/nfs/write.c98
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/export.c16
-rw-r--r--fs/nfsd/nfs3xdr.c75
-rw-r--r--fs/nfsd/nfs4acl.c4
-rw-r--r--fs/nfsd/nfs4callback.c263
-rw-r--r--fs/nfsd/nfs4idmap.c20
-rw-r--r--fs/nfsd/nfs4proc.c89
-rw-r--r--fs/nfsd/nfs4state.c685
-rw-r--r--fs/nfsd/nfs4xdr.c42
-rw-r--r--fs/nfsd/nfsctl.c29
-rw-r--r--fs/nfsd/nfsfh.c158
-rw-r--r--fs/nfsd/nfssvc.c56
-rw-r--r--fs/nfsd/vfs.c12
-rw-r--r--fs/nilfs2/Kconfig2
-rw-r--r--fs/nilfs2/bmap.c151
-rw-r--r--fs/nilfs2/bmap.h76
-rw-r--r--fs/nilfs2/btnode.c4
-rw-r--r--fs/nilfs2/btree.c625
-rw-r--r--fs/nilfs2/cpfile.c11
-rw-r--r--fs/nilfs2/cpfile.h2
-rw-r--r--fs/nilfs2/dat.c42
-rw-r--r--fs/nilfs2/dat.h8
-rw-r--r--fs/nilfs2/direct.c161
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/ifile.h1
-rw-r--r--fs/nilfs2/inode.c5
-rw-r--r--fs/nilfs2/ioctl.c26
-rw-r--r--fs/nilfs2/mdt.c44
-rw-r--r--fs/nilfs2/mdt.h3
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h10
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/sufile.h1
-rw-r--r--fs/nilfs2/super.c106
-rw-r--r--fs/nilfs2/the_nilfs.c19
-rw-r--r--fs/nilfs2/the_nilfs.h45
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c46
-rw-r--r--fs/notify/inotify/inotify_user.c254
-rw-r--r--fs/notify/notification.c11
-rw-r--r--fs/ntfs/file.c16
-rw-r--r--fs/ntfs/layout.h2
-rw-r--r--fs/ntfs/malloc.h2
-rw-r--r--fs/ntfs/mft.c13
-rw-r--r--fs/ocfs2/alloc.c49
-rw-r--r--fs/ocfs2/aops.c69
-rw-r--r--fs/ocfs2/dcache.c46
-rw-r--r--fs/ocfs2/dcache.h3
-rw-r--r--fs/ocfs2/dlm/dlmast.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/file.c54
-rw-r--r--fs/ocfs2/journal.c8
-rw-r--r--fs/ocfs2/journal.h19
-rw-r--r--fs/ocfs2/ocfs2.h22
-rw-r--r--fs/ocfs2/ocfs2_lockid.h1
-rw-r--r--fs/ocfs2/quota.h3
-rw-r--r--fs/ocfs2/quota_global.c148
-rw-r--r--fs/ocfs2/quota_local.c110
-rw-r--r--fs/ocfs2/stack_o2cb.c3
-rw-r--r--fs/ocfs2/super.c36
-rw-r--r--fs/ocfs2/xattr.c3
-rw-r--r--fs/omfs/dir.c2
-rw-r--r--fs/omfs/file.c4
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/omfs/omfs.h6
-rw-r--r--fs/open.c12
-rw-r--r--fs/partitions/check.c16
-rw-r--r--fs/proc/base.c80
-rw-r--r--fs/proc/kcore.c35
-rw-r--r--fs/proc/meminfo.c4
-rw-r--r--fs/proc/page.c5
-rw-r--r--fs/proc/task_mmu.c29
-rw-r--r--fs/proc/task_nommu.c1
-rw-r--r--fs/quota/dquot.c4
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/select.c1
-rw-r--r--fs/splice.c30
-rw-r--r--fs/squashfs/super.c4
-rw-r--r--fs/super.c13
-rw-r--r--fs/sync.c85
-rw-r--r--fs/sysfs/dir.c1
-rw-r--r--fs/sysfs/inode.c135
-rw-r--r--fs/sysfs/symlink.c2
-rw-r--r--fs/sysfs/sysfs.h12
-rw-r--r--fs/ubifs/budget.c34
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/debug.c112
-rw-r--r--fs/ubifs/debug.h5
-rw-r--r--fs/ubifs/file.c62
-rw-r--r--fs/ubifs/gc.c2
-rw-r--r--fs/ubifs/io.c29
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/key.h35
-rw-r--r--fs/ubifs/log.c17
-rw-r--r--fs/ubifs/lprops.c43
-rw-r--r--fs/ubifs/master.c20
-rw-r--r--fs/ubifs/orphan.c7
-rw-r--r--fs/ubifs/recovery.c4
-rw-r--r--fs/ubifs/replay.c6
-rw-r--r--fs/ubifs/scan.c32
-rw-r--r--fs/ubifs/super.c41
-rw-r--r--fs/ubifs/tnc.c76
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/ubifs-media.h7
-rw-r--r--fs/ubifs/ubifs.h13
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/udf/directory.c86
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c19
-rw-r--r--fs/udf/lowlevel.c4
-rw-r--r--fs/udf/namei.c1
-rw-r--r--fs/xattr.c55
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c17
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c51
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c28
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c28
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h2
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c78
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_attr.c8
-rw-r--r--fs/xfs/xfs_bmap.c4
-rw-r--r--fs/xfs/xfs_bmap.h11
-rw-r--r--fs/xfs/xfs_bmap_btree.c20
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c46
-rw-r--r--fs/xfs/xfs_btree.h15
-rw-r--r--fs/xfs/xfs_da_btree.c6
-rw-r--r--fs/xfs/xfs_dir2.c2
-rw-r--r--fs/xfs/xfs_fs.h2
-rw-r--r--fs/xfs/xfs_fsops.c20
-rw-r--r--fs/xfs/xfs_ialloc.c805
-rw-r--r--fs/xfs/xfs_ialloc.h18
-rw-r--r--fs/xfs/xfs_iget.c140
-rw-r--r--fs/xfs/xfs_inode.c18
-rw-r--r--fs/xfs/xfs_inode.h8
-rw-r--r--fs/xfs/xfs_inode_item.c10
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h1
-rw-r--r--fs/xfs/xfs_itable.c98
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_log.c2
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_mru_cache.c29
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_rw.c84
-rw-r--r--fs/xfs/xfs_rw.h7
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_buf.c4
-rw-r--r--fs/xfs/xfs_trans_inode.c86
-rw-r--r--fs/xfs/xfs_vnodeops.c21
326 files changed, 8800 insertions, 6900 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..f7003cfac63d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
* Return 0 upon success, -ERRNO upon failure.
*/
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
{
char *options;
substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
v9ses->debug = 0;
v9ses->cache = 0;
- if (!v9ses->options)
+ if (!opts)
return 0;
- options = kstrdup(v9ses->options, GFP_KERNEL);
+ options = kstrdup(opts, GFP_KERNEL);
if (!options) {
P9_DPRINTK(P9_DEBUG_ERROR,
"failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->uid = ~0;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
- if (data) {
- v9ses->options = kstrdup(data, GFP_KERNEL);
- if (!v9ses->options) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "failed to allocate copy of option string\n");
- retval = -ENOMEM;
- goto error;
- }
- }
- rc = v9fs_parse_options(v9ses);
+ rc = v9fs_parse_options(v9ses, data);
if (rc < 0) {
retval = rc;
goto error;
}
- v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+ v9ses->clnt = p9_client_create(dev_name, data);
if (IS_ERR(v9ses->clnt)) {
retval = PTR_ERR(v9ses->clnt);
v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
__putname(v9ses->uname);
__putname(v9ses->aname);
- kfree(v9ses->options);
}
/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..38762bf102a9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
unsigned int afid;
unsigned int cache;
- char *options; /* copy of mount options */
char *uname; /* user name to mount as */
char *aname; /* name of remote hierarchy being mounted */
unsigned int maxdata; /* max data for client interface */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..06a223d50a81 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
/**
* v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
* @wstat: structure to initialize
*
*/
@@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
struct inode *v9fs_get_inode(struct super_block *sb, int mode)
{
+ int err;
struct inode *inode;
struct v9fs_session_info *v9ses = sb->s_fs_info;
P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
inode = new_inode(sb);
- if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
- inode->i_rdev = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- inode->i_mapping->a_ops = &v9fs_addr_operations;
-
- switch (mode & S_IFMT) {
- case S_IFIFO:
- case S_IFBLK:
- case S_IFCHR:
- case S_IFSOCK:
- if (!v9fs_extended(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "special files without extended mode\n");
- return ERR_PTR(-EINVAL);
- }
- init_special_inode(inode, inode->i_mode,
- inode->i_rdev);
- break;
- case S_IFREG:
- inode->i_op = &v9fs_file_inode_operations;
- inode->i_fop = &v9fs_file_operations;
- break;
- case S_IFLNK:
- if (!v9fs_extended(v9ses)) {
- P9_DPRINTK(P9_DEBUG_ERROR,
- "extended modes used w/o 9P2000.u\n");
- return ERR_PTR(-EINVAL);
- }
- inode->i_op = &v9fs_symlink_inode_operations;
- break;
- case S_IFDIR:
- inc_nlink(inode);
- if (v9fs_extended(v9ses))
- inode->i_op = &v9fs_dir_inode_operations_ext;
- else
- inode->i_op = &v9fs_dir_inode_operations;
- inode->i_fop = &v9fs_dir_operations;
- break;
- default:
- P9_DPRINTK(P9_DEBUG_ERROR,
- "BAD mode 0x%x S_IFMT 0x%x\n",
- mode, mode & S_IFMT);
- return ERR_PTR(-EINVAL);
- }
- } else {
+ if (!inode) {
P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
return ERR_PTR(-ENOMEM);
}
+
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_blocks = 0;
+ inode->i_rdev = 0;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFSOCK:
+ if (!v9fs_extended(v9ses)) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "special files without extended mode\n");
+ err = -EINVAL;
+ goto error;
+ }
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ case S_IFREG:
+ inode->i_op = &v9fs_file_inode_operations;
+ inode->i_fop = &v9fs_file_operations;
+ break;
+ case S_IFLNK:
+ if (!v9fs_extended(v9ses)) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "extended modes used w/o 9P2000.u\n");
+ err = -EINVAL;
+ goto error;
+ }
+ inode->i_op = &v9fs_symlink_inode_operations;
+ break;
+ case S_IFDIR:
+ inc_nlink(inode);
+ if (v9fs_extended(v9ses))
+ inode->i_op = &v9fs_dir_inode_operations_ext;
+ else
+ inode->i_op = &v9fs_dir_inode_operations;
+ inode->i_fop = &v9fs_dir_operations;
+ break;
+ default:
+ P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+ mode, mode & S_IFMT);
+ err = -EINVAL;
+ goto error;
+ }
+
return inode;
+
+error:
+ iput(inode);
+ return ERR_PTR(err);
}
/*
@@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
ret = NULL;
st = p9_client_stat(fid);
- if (IS_ERR(st)) {
- err = PTR_ERR(st);
- st = NULL;
- goto error;
- }
+ if (IS_ERR(st))
+ return ERR_CAST(st);
umode = p9mode2unixmode(v9ses, st->mode);
ret = v9fs_get_inode(sb, umode);
if (IS_ERR(ret)) {
err = PTR_ERR(ret);
- ret = NULL;
goto error;
}
v9fs_stat2inode(st, ret, sb);
ret->i_ino = v9fs_qid2ino(&st->qid);
+ p9stat_free(st);
kfree(st);
return ret;
error:
+ p9stat_free(st);
kfree(st);
- if (ret)
- iput(ret);
-
return ERR_PTR(err);
}
@@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
* @v9ses: session information
* @dir: directory that dentry is being created in
* @dentry: dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
* @perm: create permissions
* @mode: open mode
- * @extension: 9p2000.u extension string to support devices, etc.
*
*/
static struct p9_fid *
@@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
dentry->d_op = &v9fs_dentry_operations;
d_instantiate(dentry, inode);
- v9fs_fid_add(dentry, fid);
+ err = v9fs_fid_add(dentry, fid);
+ if (err < 0)
+ goto error;
+
return ofid;
error:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 38d695d66a0b..8961f1a8f668 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
static void
v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
- int flags)
+ int flags, void *data)
{
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
MS_NOATIME;
+
+ save_mount_options(sb, data);
}
/**
@@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
struct v9fs_session_info *v9ses = NULL;
struct p9_wstat *st = NULL;
int mode = S_IRWXUGO | S_ISVTX;
- uid_t uid = current_fsuid();
- gid_t gid = current_fsgid();
struct p9_fid *fid;
int retval = 0;
P9_DPRINTK(P9_DEBUG_VFS, " \n");
- st = NULL;
v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
if (!v9ses)
return -ENOMEM;
@@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(sb);
goto free_stat;
}
- v9fs_fill_super(sb, v9ses, flags);
+ v9fs_fill_super(sb, v9ses, flags, data);
inode = v9fs_get_inode(sb, S_IFDIR | mode);
if (IS_ERR(inode)) {
@@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
goto release_sb;
}
- inode->i_uid = uid;
- inode->i_gid = gid;
-
root = d_alloc_root(inode);
if (!root) {
iput(inode);
@@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
simple_set_mnt(mnt, sb);
return 0;
-release_sb:
- deactivate_locked_super(sb);
-
free_stat:
+ p9stat_free(st);
kfree(st);
clunk_fid:
@@ -185,7 +179,12 @@ clunk_fid:
close_session:
v9fs_session_close(v9ses);
kfree(v9ses);
+ return retval;
+release_sb:
+ p9stat_free(st);
+ kfree(st);
+ deactivate_locked_super(sb);
return retval;
}
@@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s)
v9fs_session_close(v9ses);
kfree(v9ses);
+ s->s_fs_info = NULL;
P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
}
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
- struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
- seq_printf(m, "%s", v9ses->options);
- return 0;
-}
-
static void
v9fs_umount_begin(struct super_block *sb)
{
@@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb)
static const struct super_operations v9fs_super_ops = {
.statfs = simple_statfs,
.clear_inode = v9fs_clear_inode,
- .show_options = v9fs_show_options,
+ .show_options = generic_show_options,
.umount_begin = v9fs_umount_begin,
};
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..d4bf8caad8d0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
endif # BLOCK
@@ -108,6 +109,7 @@ source "fs/sysfs/Kconfig"
config TMPFS
bool "Virtual memory file system support (former shm fs)"
+ depends on SHMEM
help
Tmpfs is a file system which keeps all files in virtual memory.
@@ -186,7 +188,6 @@ source "fs/romfs/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
inode = page->mapping->host;
- ASSERT(file != NULL);
- key = file->private_data;
- ASSERT(key != NULL);
+ if (file) {
+ key = file->private_data;
+ ASSERT(key != NULL);
+ } else {
+ key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto error_nokey;
+ }
+ }
_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
unlock_page(page);
}
+ if (!file)
+ key_put(key);
_leave(" = 0");
return 0;
error:
SetPageError(page);
unlock_page(page);
+ if (!file)
+ key_put(key);
+error_nokey:
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3ff8bdd18fb3..0931bc1325eb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl);
static struct workqueue_struct *afs_lock_manager;
static DEFINE_MUTEX(afs_lock_manager_mutex);
-static struct file_lock_operations afs_lock_ops = {
+static const struct file_lock_operations afs_lock_ops = {
.fl_copy_lock = afs_fl_copy_lock,
.fl_release_private = afs_fl_release_private,
};
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
.bdi = mapping->backing_dev_info,
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
- .for_writepages = 1,
.range_cyclic = 1,
};
int ret;
diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..fc21c23b2387 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -24,6 +24,7 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/mmu_context.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
@@ -34,7 +35,6 @@
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
-#include <asm/mmu_context.h>
#if DEBUG > 1
#define dprintk printk
@@ -595,51 +595,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
}
/*
- * use_mm
- * Makes the calling kernel thread take on the specified
- * mm context.
- * Called by the retry thread execute retries within the
- * iocb issuer's mm context, so that copy_from/to_user
- * operations work seamlessly for aio.
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-static void use_mm(struct mm_struct *mm)
-{
- struct mm_struct *active_mm;
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- active_mm = tsk->active_mm;
- atomic_inc(&mm->mm_count);
- tsk->mm = mm;
- tsk->active_mm = mm;
- switch_mm(active_mm, mm, tsk);
- task_unlock(tsk);
-
- mmdrop(active_mm);
-}
-
-/*
- * unuse_mm
- * Reverses the effect of use_mm, i.e. releases the
- * specified mm context which was earlier taken on
- * by the calling kernel thread
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-static void unuse_mm(struct mm_struct *mm)
-{
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- tsk->mm = NULL;
- /* active_mm is still 'mm' */
- enter_lazy_tlb(mm, tsk);
- task_unlock(tsk);
-}
-
-/*
* Queue up a kiocb to be retried. Assumes that the kiocb
* has already been marked as kicked, and places it on
* the retry run list for the corresponding ioctx, if it
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 2316e944a109..e947915109e5 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -90,7 +90,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
continue;
}
- while (d_mountpoint(path.dentry) && follow_down(&path));
+ while (d_mountpoint(path.dentry) && follow_down(&path))
;
umount_ok = may_umount(path.mnt);
path_put(&path);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index aa39ae83f019..3da18d453488 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
}
/* Update the expiry counter if fs is busy */
- if (!may_umount_tree(mnt)) {
+ if (!may_umount_tree(path.mnt)) {
struct autofs_info *ino = autofs4_dentry_ino(top);
ino->last_used = jiffies;
goto done;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 615d5496fe0f..dd376c124e71 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -842,7 +842,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = BEFS_SUPER_MAGIC;
/* Set real blocksize of fs */
sb_set_blocksize(sb, (ulong) befs_sb->block_size);
- sb->s_op = (struct super_operations *) &befs_sops;
+ sb->s_op = &befs_sops;
root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
if (IS_ERR(root)) {
ret = PTR_ERR(root);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..442d94fe255c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
}
}
- /*
- * Now fill out the bss section. First pad the last page up
- * to the page boundary, and then perform a mmap to make sure
- * that there are zero-mapped pages up to and including the
- * last bss page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out_close;
- }
+ if (last_bss > elf_bss) {
+ /*
+ * Now fill out the bss section. First pad the last page up
+ * to the page boundary, and then perform a mmap to make sure
+ * that there are zero-mapped pages up to and including the
+ * last bss page.
+ */
+ if (padzero(elf_bss)) {
+ error = -EFAULT;
+ goto out_close;
+ }
- /* What we have mapped so far */
- elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+ /* What we have mapped so far */
+ elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
- /* Map the last of the bss segment */
- if (last_bss > elf_bss) {
+ /* Map the last of the bss segment */
down_write(&current->mm->mmap_sem);
error = do_brk(elf_bss, last_bss - elf_bss);
up_write(&current->mm->mmap_sem);
@@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file,
#define DUMP_WRITE(addr, nr) \
if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(off) \
- if (!dump_seek(file, (off))) \
- goto end_coredump;
static void fill_elf_header(struct elfhdr *elf, int segs,
u16 machine, u32 flags, u8 osabi)
@@ -2016,7 +2013,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
goto end_coredump;
/* Align to page */
- DUMP_SEEK(dataoff - foffset);
+ if (!dump_seek(file, dataoff - foffset))
+ goto end_coredump;
for (vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma)) {
@@ -2027,33 +2025,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
struct page *page;
- struct vm_area_struct *tmp_vma;
-
- if (get_user_pages(current, current->mm, addr, 1, 0, 1,
- &page, &tmp_vma) <= 0) {
- DUMP_SEEK(PAGE_SIZE);
- } else {
- if (page == ZERO_PAGE(0)) {
- if (!dump_seek(file, PAGE_SIZE)) {
- page_cache_release(page);
- goto end_coredump;
- }
- } else {
- void *kaddr;
- flush_cache_page(tmp_vma, addr,
- page_to_pfn(page));
- kaddr = kmap(page);
- if ((size += PAGE_SIZE) > limit ||
- !dump_write(file, kaddr,
- PAGE_SIZE)) {
- kunmap(page);
- page_cache_release(page);
- goto end_coredump;
- }
- kunmap(page);
- }
+ int stop;
+
+ page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+ stop = ((size += PAGE_SIZE) > limit) ||
+ !dump_write(file, kaddr, PAGE_SIZE);
+ kunmap(page);
page_cache_release(page);
- }
+ } else
+ stop = !dump_seek(file, PAGE_SIZE);
+ if (stop)
+ goto end_coredump;
}
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 20fbeced472b..76285471073e 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1325,9 +1325,6 @@ static int writenote(struct memelfnote *men, struct file *file)
#define DUMP_WRITE(addr, nr) \
if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
goto end_coredump;
-#define DUMP_SEEK(off) \
- if (!dump_seek(file, (off))) \
- goto end_coredump;
static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
{
@@ -1518,6 +1515,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
unsigned long *limit, unsigned long mm_flags)
{
struct vm_area_struct *vma;
+ int err = 0;
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
unsigned long addr;
@@ -1525,43 +1523,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
if (!maydump(vma, mm_flags))
continue;
- for (addr = vma->vm_start;
- addr < vma->vm_end;
- addr += PAGE_SIZE
- ) {
- struct vm_area_struct *vma;
- struct page *page;
-
- if (get_user_pages(current, current->mm, addr, 1, 0, 1,
- &page, &vma) <= 0) {
- DUMP_SEEK(file->f_pos + PAGE_SIZE);
- }
- else if (page == ZERO_PAGE(0)) {
- page_cache_release(page);
- DUMP_SEEK(file->f_pos + PAGE_SIZE);
- }
- else {
- void *kaddr;
-
- flush_cache_page(vma, addr, page_to_pfn(page));
- kaddr = kmap(page);
- if ((*size += PAGE_SIZE) > *limit ||
- !dump_write(file, kaddr, PAGE_SIZE)
- ) {
- kunmap(page);
- page_cache_release(page);
- return -EIO;
- }
+ for (addr = vma->vm_start; addr < vma->vm_end;
+ addr += PAGE_SIZE) {
+ struct page *page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+ *size += PAGE_SIZE;
+ if (*size > *limit)
+ err = -EFBIG;
+ else if (!dump_write(file, kaddr, PAGE_SIZE))
+ err = -EIO;
kunmap(page);
page_cache_release(page);
- }
+ } else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
+ err = -EFBIG;
+ if (err)
+ goto out;
}
}
-
- return 0;
-
-end_coredump:
- return -EFBIG;
+out:
+ return err;
}
#endif
@@ -1802,7 +1783,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
goto end_coredump;
}
- DUMP_SEEK(dataoff);
+ if (!dump_seek(file, dataoff))
+ goto end_coredump;
if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
goto end_coredump;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 94dfda24c06e..5d1ed50bd46c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
{
struct bdev_inode *bdi = BDEV_I(inode);
- bdi->bdev.bd_inode_backing_dev_info = NULL;
kmem_cache_free(bdev_cachep, bdi);
}
@@ -1115,7 +1114,7 @@ EXPORT_SYMBOL(revalidate_disk);
int check_disk_change(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
- struct block_device_operations * bdops = disk->fops;
+ const struct block_device_operations *bdops = disk->fops;
if (!bdops->media_changed)
return 0;
@@ -1405,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
}
/*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ BUG_ON(iocb->ki_pos != pos);
+
+ ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
+/*
* Try to release a page associated with block device when the system
* is under memory pressure.
*/
@@ -1436,7 +1462,7 @@ const struct file_operations def_blk_fops = {
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write_nolock,
+ .aio_write = blkdev_aio_write,
.mmap = generic_file_mmap,
.fsync = block_fsync,
.unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..6c4173146bb7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -772,7 +772,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
-static struct address_space_operations btree_aops = {
+static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
.writepage = btree_writepage,
.writepages = btree_writepages,
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
+ bdi->name = "btrfs";
bdi->capabilities = BDI_CAP_MAP_COPY;
err = bdi_init(bdi);
if (err)
@@ -1599,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
/*
* we set the i_size on the btree inode to the max possible int.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..535f85ba104f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ DISCARD_FL_BARRIER);
}
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 272b9b2bea86..9096fd0ca3ca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -55,13 +55,13 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
-static struct inode_operations btrfs_dir_inode_operations;
-static struct inode_operations btrfs_symlink_inode_operations;
-static struct inode_operations btrfs_dir_ro_inode_operations;
-static struct inode_operations btrfs_special_inode_operations;
-static struct inode_operations btrfs_file_inode_operations;
-static struct address_space_operations btrfs_aops;
-static struct address_space_operations btrfs_symlink_aops;
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_dir_ro_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct address_space_operations btrfs_symlink_aops;
static struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
@@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_inode *entry;
- struct rb_node **p = &root->inode_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+
+again:
+ p = &root->inode_tree.rb_node;
+ parent = NULL;
spin_lock(&root->inode_lock);
while (*p) {
@@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode)
entry = rb_entry(parent, struct btrfs_inode, rb_node);
if (inode->i_ino < entry->vfs_inode.i_ino)
- p = &(*p)->rb_left;
+ p = &parent->rb_left;
else if (inode->i_ino > entry->vfs_inode.i_ino)
- p = &(*p)->rb_right;
+ p = &parent->rb_right;
else {
WARN_ON(!(entry->vfs_inode.i_state &
(I_WILL_FREE | I_FREEING | I_CLEAR)));
- break;
+ rb_erase(parent, &root->inode_tree);
+ RB_CLEAR_NODE(parent);
+ spin_unlock(&root->inode_lock);
+ goto again;
}
}
rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
@@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ spin_lock(&root->inode_lock);
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- spin_lock(&root->inode_lock);
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- spin_unlock(&root->inode_lock);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
}
+ spin_unlock(&root->inode_lock);
}
static noinline void init_btrfs_i(struct inode *inode)
@@ -5194,7 +5201,7 @@ static int btrfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask, btrfs_check_acl);
}
-static struct inode_operations btrfs_dir_inode_operations = {
+static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
.create = btrfs_create,
@@ -5212,7 +5219,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
};
-static struct inode_operations btrfs_dir_ro_inode_operations = {
+static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
};
@@ -5252,7 +5259,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
*
* For now we're avoiding this by dropping bmap.
*/
-static struct address_space_operations btrfs_aops = {
+static const struct address_space_operations btrfs_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
@@ -5264,14 +5271,14 @@ static struct address_space_operations btrfs_aops = {
.set_page_dirty = btrfs_set_page_dirty,
};
-static struct address_space_operations btrfs_symlink_aops = {
+static const struct address_space_operations btrfs_symlink_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
};
-static struct inode_operations btrfs_file_inode_operations = {
+static const struct inode_operations btrfs_file_inode_operations = {
.truncate = btrfs_truncate,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
@@ -5283,7 +5290,7 @@ static struct inode_operations btrfs_file_inode_operations = {
.fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
-static struct inode_operations btrfs_special_inode_operations = {
+static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
@@ -5292,7 +5299,7 @@ static struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
-static struct inode_operations btrfs_symlink_inode_operations = {
+static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
.nr_to_write = mapping->nrpages * 2,
.range_start = start,
.range_end = end,
- .for_writepages = 1,
};
return btrfs_writepages(mapping, &wbc);
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6d6d06cb6dfc..2db17cd66fc5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,7 @@
#include "export.h"
#include "compression.h"
-static struct super_operations btrfs_super_ops;
+static const struct super_operations btrfs_super_ops;
static void btrfs_put_super(struct super_block *sb)
{
@@ -675,7 +675,7 @@ static int btrfs_unfreeze(struct super_block *sb)
return 0;
}
-static struct super_operations btrfs_super_ops = {
+static const struct super_operations btrfs_super_ops = {
.delete_inode = btrfs_delete_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d91b0de7c502..30c0d45c1b5e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2605,7 +2605,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
extent);
cs = btrfs_file_extent_offset(src, extent);
cl = btrfs_file_extent_num_bytes(src,
- extent);;
+ extent);
if (btrfs_file_extent_compression(src,
extent)) {
cs = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
num_run++;
batch_run++;
- if (bio_sync(cur))
+ if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
num_sync_run++;
if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw;
spin_lock(&device->io_lock);
- if (bio_sync(bio))
+ if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
diff --git a/fs/buffer.c b/fs/buffer.c
index a3ef091a45bd..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
struct zone *zone;
int nid;
- wakeup_pdflush(1024);
+ wakeup_flusher_threads(1024);
yield();
for_each_online_node(nid) {
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
- if (!TestSetPageDirty(page))
- __set_page_dirty(page, page_mapping(page), 0);
+ if (!TestSetPageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+ if (mapping)
+ __set_page_dirty(page, mapping, 0);
+ }
}
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..3cbc57f932d2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
* - no readahead or I/O queue unplugging required
*/
struct backing_dev_info directly_mappable_cdev_bdi = {
+ .name = "char",
.capabilities = (
#ifdef CONFIG_MMU
/* permit private copies of the data to be taken */
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
}
/**
- * register_chrdev() - Register a major number for character devices.
+ * __register_chrdev() - create and register a cdev occupying a range of minors
* @major: major device number or 0 for dynamic allocation
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
* @name: name of this range of devices
* @fops: file operations associated with this devices
*
@@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
* /dev. It only helps to keep track of the different owners of devices. If
* your module name has only one type of devices it's ok to use e.g. the name
* of the module here.
- *
- * This function registers a range of 256 minor numbers. The first minor number
- * is 0.
*/
-int register_chrdev(unsigned int major, const char *name,
- const struct file_operations *fops)
+int __register_chrdev(unsigned int major, unsigned int baseminor,
+ unsigned int count, const char *name,
+ const struct file_operations *fops)
{
struct char_device_struct *cd;
struct cdev *cdev;
char *s;
int err = -ENOMEM;
- cd = __register_chrdev_region(major, 0, 256, name);
+ cd = __register_chrdev_region(major, baseminor, count, name);
if (IS_ERR(cd))
return PTR_ERR(cd);
@@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name,
for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
*s = '!';
- err = cdev_add(cdev, MKDEV(cd->major, 0), 256);
+ err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
if (err)
goto out;
@@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name,
out:
kobject_put(&cdev->kobj);
out2:
- kfree(__unregister_chrdev_region(cd->major, 0, 256));
+ kfree(__unregister_chrdev_region(cd->major, baseminor, count));
return err;
}
@@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
}
}
-void unregister_chrdev(unsigned int major, const char *name)
+/**
+ * __unregister_chrdev - unregister and destroy a cdev
+ * @major: major device number
+ * @baseminor: first of the range of minor numbers
+ * @count: the number of minor numbers this cdev is occupying
+ * @name: name of this range of devices
+ *
+ * Unregister and destroy the cdev occupying the region described by
+ * @major, @baseminor and @count. This function undoes what
+ * __register_chrdev() did.
+ */
+void __unregister_chrdev(unsigned int major, unsigned int baseminor,
+ unsigned int count, const char *name)
{
struct char_device_struct *cd;
- cd = __unregister_chrdev_region(major, 0, 256);
+
+ cd = __unregister_chrdev_region(major, baseminor, count);
if (cd && cd->cdev)
cdev_del(cd->cdev);
kfree(cd);
@@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
EXPORT_SYMBOL(cdev_index);
-EXPORT_SYMBOL(register_chrdev);
-EXPORT_SYMBOL(unregister_chrdev);
+EXPORT_SYMBOL(__register_chrdev);
+EXPORT_SYMBOL(__unregister_chrdev);
EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e85b1e4389e0..145540a316ab 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.60
Fix memory leak in reconnect. Fix oops in DFS mount error path.
Set s_maxbytes to smaller (the max that vfs can handle) so that
sendfile will now work over cifs mounts again. Add noforcegid
-and noforceuid mount parameters.
+and noforceuid mount parameters. Fix small mem leak when using
+ntlmv2. Fix 2nd mount to same server but with different port to
+be allowed (rather than reusing the 1st port) - only when the
+user explicitly overrides the port on the 2nd mount.
Version 1.59
------------
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606912d8f2a8..fea9e898c4ba 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -142,7 +142,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
if (rc != 0) {
cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
- __func__, *devname, rc));;
+ __func__, *devname, rc));
goto compose_mount_options_err;
}
/* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -385,7 +385,7 @@ out_err:
goto out;
}
-struct inode_operations cifs_dfs_referral_inode_operations = {
+const struct inode_operations cifs_dfs_referral_inode_operations = {
.follow_link = cifs_dfs_follow_mountpoint,
};
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
if (server->addr.sockAddr.sin_family == AF_INET)
sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
else if (server->addr.sockAddr.sin_family == AF_INET6)
- sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr);
+ sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
else
goto out;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
return get_cifs_acl_by_path(cifs_sb, path, pacllen);
pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
return pntsd;
}
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
return rc;
}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
compare with the NTLM example */
hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+ kfree(pctxt);
return rc;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84b75253b05a..d79ce2e95c23 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -50,7 +50,7 @@
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
#ifdef CONFIG_CIFS_QUOTA
-static struct quotactl_ops cifs_quotactl_ops;
+static const struct quotactl_ops cifs_quotactl_ops;
#endif /* QUOTA */
int cifsFYI = 0;
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
static int
cifs_show_options(struct seq_file *s, struct vfsmount *m)
{
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *tcon;
-
- cifs_sb = CIFS_SB(m->mnt_sb);
- tcon = cifs_sb->tcon;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
+ struct cifsTconInfo *tcon = cifs_sb->tcon;
- seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
+ seq_printf(s, ",unc=%s", tcon->treeName);
if (tcon->ses->userName)
seq_printf(s, ",username=%s", tcon->ses->userName);
if (tcon->ses->domainName)
@@ -520,7 +517,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
return rc;
}
-static struct quotactl_ops cifs_quotactl_ops = {
+static const struct quotactl_ops cifs_quotactl_ops = {
.set_xquota = cifs_xquota_set,
.get_xquota = cifs_xquota_get,
.set_xstate = cifs_xstate_set,
@@ -989,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg)
if (try_to_freeze())
continue;
- spin_lock(&GlobalMid_Lock);
- if (list_empty(&GlobalOplock_Q)) {
- spin_unlock(&GlobalMid_Lock);
+ spin_lock(&cifs_oplock_lock);
+ if (list_empty(&cifs_oplock_list)) {
+ spin_unlock(&cifs_oplock_lock);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(39*HZ);
} else {
- oplock_item = list_entry(GlobalOplock_Q.next,
+ oplock_item = list_entry(cifs_oplock_list.next,
struct oplock_q_entry, qhead);
cFYI(1, ("found oplock item to write out"));
pTcon = oplock_item->tcon;
inode = oplock_item->pinode;
netfid = oplock_item->netfid;
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_oplock_lock);
DeleteOplockQEntry(oplock_item);
/* can not grab inode sem here since it would
deadlock when oplock received on delete
@@ -1058,7 +1055,7 @@ init_cifs(void)
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
- INIT_LIST_HEAD(&GlobalOplock_Q);
+ INIT_LIST_HEAD(&cifs_oplock_list);
#ifdef CONFIG_CIFS_EXPERIMENTAL
INIT_LIST_HEAD(&GlobalDnotifyReqList);
INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1087,6 +1084,7 @@ init_cifs(void)
rwlock_init(&GlobalSMBSeslock);
rwlock_init(&cifs_tcp_ses_lock);
spin_lock_init(&GlobalMid_Lock);
+ spin_lock_init(&cifs_oplock_lock);
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..ac2b24c192f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
extern const struct inode_operations cifs_file_inode_ops;
extern const struct inode_operations cifs_symlink_inode_ops;
-extern struct inode_operations cifs_dfs_referral_inode_operations;
+extern const struct inode_operations cifs_dfs_referral_inode_operations;
/* Functions related to files and directories */
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* EXPERIMENTAL */
-#define CIFS_VERSION "1.60"
+#define CIFS_VERSION "1.61"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..6cfc81a32703 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,11 +351,24 @@ struct cifsFileInfo {
bool closePend:1; /* file is marked to close */
bool invalidHandle:1; /* file closed via session abend */
bool messageMode:1; /* for pipes: message vs byte mode */
- atomic_t wrtPending; /* handle in use - defer close */
+ atomic_t count; /* reference count */
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
};
+/* Take a reference on the file private data */
+static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
+{
+ atomic_inc(&cifs_file->count);
+}
+
+/* Release a reference on the file private data */
+static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+ if (atomic_dec_and_test(&cifs_file->count))
+ kfree(cifs_file);
+}
+
/*
* One of these for each file inode
*/
@@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
*/
GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
-GLOBAL_EXTERN struct list_head GlobalOplock_Q;
+/* Global list of oplocks */
+GLOBAL_EXTERN struct list_head cifs_oplock_list;
+
+/* Protects the cifs_oplock_list */
+GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
/* Outstanding dir notify requests */
GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..301e307e1279 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
to this tcon */
}
-/* Allocate and return pointer to an SMB request buffer, and set basic
- SMB information in the SMB header. If the return code is zero, this
- function must have filled in request_buf pointer */
+/* reconnect the socket, tcon, and smb session if needed */
static int
-small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
- void **request_buf)
+cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
{
int rc = 0;
+ struct cifsSesInfo *ses;
+ struct TCP_Server_Info *server;
+ struct nls_table *nls_codepage;
- /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
- check for tcp and smb session status done differently
- for those three - in the calling routine */
- if (tcon) {
- if (tcon->tidStatus == CifsExiting) {
- /* only tree disconnect, open, and write,
- (and ulogoff which does not have tcon)
- are allowed as we start force umount */
- if ((smb_command != SMB_COM_WRITE_ANDX) &&
- (smb_command != SMB_COM_OPEN_ANDX) &&
- (smb_command != SMB_COM_TREE_DISCONNECT)) {
- cFYI(1, ("can not send cmd %d while umounting",
- smb_command));
- return -ENODEV;
- }
+ /*
+ * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+ * tcp and smb session status done differently for those three - in the
+ * calling routine
+ */
+ if (!tcon)
+ return 0;
+
+ ses = tcon->ses;
+ server = ses->server;
+
+ /*
+ * only tree disconnect, open, and write, (and ulogoff which does not
+ * have tcon) are allowed as we start force umount
+ */
+ if (tcon->tidStatus == CifsExiting) {
+ if (smb_command != SMB_COM_WRITE_ANDX &&
+ smb_command != SMB_COM_OPEN_ANDX &&
+ smb_command != SMB_COM_TREE_DISCONNECT) {
+ cFYI(1, ("can not send cmd %d while umounting",
+ smb_command));
+ return -ENODEV;
}
- if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
- (tcon->ses->server)) {
- struct nls_table *nls_codepage;
- /* Give Demultiplex thread up to 10 seconds to
- reconnect, should be greater than cifs socket
- timeout which is 7 seconds */
- while (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- wait_event_interruptible_timeout(tcon->ses->server->response_q,
- (tcon->ses->server->tcpStatus ==
- CifsGood), 10 * HZ);
- if (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- /* on "soft" mounts we wait once */
- if (!tcon->retry ||
- (tcon->ses->status == CifsExiting)) {
- cFYI(1, ("gave up waiting on "
- "reconnect in smb_init"));
- return -EHOSTDOWN;
- } /* else "hard" mount - keep retrying
- until process is killed or server
- comes back on-line */
- } else /* TCP session is reestablished now */
- break;
- }
+ }
- nls_codepage = load_nls_default();
- /* need to prevent multiple threads trying to
- simultaneously reconnect the same SMB session */
- down(&tcon->ses->sesSem);
- if (tcon->ses->need_reconnect)
- rc = cifs_setup_session(0, tcon->ses,
- nls_codepage);
- if (!rc && (tcon->need_reconnect)) {
- mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, tcon->ses, tcon->treeName,
- tcon, nls_codepage);
- up(&tcon->ses->sesSem);
- /* BB FIXME add code to check if wsize needs
- update due to negotiated smb buffer size
- shrinking */
- if (rc == 0) {
- atomic_inc(&tconInfoReconnectCount);
- /* tell server Unix caps we support */
- if (tcon->ses->capabilities & CAP_UNIX)
- reset_cifs_unix_caps(
- 0 /* no xid */,
- tcon,
- NULL /* we do not know sb */,
- NULL /* no vol info */);
- }
+ if (ses->status == CifsExiting)
+ return -EIO;
- cFYI(1, ("reconnect tcon rc = %d", rc));
- /* Removed call to reopen open files here.
- It is safer (and faster) to reopen files
- one at a time as needed in read and write */
-
- /* Check if handle based operation so we
- know whether we can continue or not without
- returning to caller to reset file handle */
- switch (smb_command) {
- case SMB_COM_READ_ANDX:
- case SMB_COM_WRITE_ANDX:
- case SMB_COM_CLOSE:
- case SMB_COM_FIND_CLOSE2:
- case SMB_COM_LOCKING_ANDX: {
- unload_nls(nls_codepage);
- return -EAGAIN;
- }
- }
- } else {
- up(&tcon->ses->sesSem);
- }
- unload_nls(nls_codepage);
+ /*
+ * Give demultiplex thread up to 10 seconds to reconnect, should be
+ * greater than cifs socket timeout which is 7 seconds
+ */
+ while (server->tcpStatus == CifsNeedReconnect) {
+ wait_event_interruptible_timeout(server->response_q,
+ (server->tcpStatus == CifsGood), 10 * HZ);
- } else {
- return -EIO;
+ /* is TCP session is reestablished now ?*/
+ if (server->tcpStatus != CifsNeedReconnect)
+ break;
+
+ /*
+ * on "soft" mounts we wait once. Hard mounts keep
+ * retrying until process is killed or server comes
+ * back on-line
+ */
+ if (!tcon->retry || ses->status == CifsExiting) {
+ cFYI(1, ("gave up waiting on reconnect in smb_init"));
+ return -EHOSTDOWN;
}
}
+
+ if (!ses->need_reconnect && !tcon->need_reconnect)
+ return 0;
+
+ nls_codepage = load_nls_default();
+
+ /*
+ * need to prevent multiple threads trying to simultaneously
+ * reconnect the same SMB session
+ */
+ down(&ses->sesSem);
+ if (ses->need_reconnect)
+ rc = cifs_setup_session(0, ses, nls_codepage);
+
+ /* do we need to reconnect tcon? */
+ if (rc || !tcon->need_reconnect) {
+ up(&ses->sesSem);
+ goto out;
+ }
+
+ mark_open_files_invalid(tcon);
+ rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
+ up(&ses->sesSem);
+ cFYI(1, ("reconnect tcon rc = %d", rc));
+
+ if (rc)
+ goto out;
+
+ /*
+ * FIXME: check if wsize needs updated due to negotiated smb buffer
+ * size shrinking
+ */
+ atomic_inc(&tconInfoReconnectCount);
+
+ /* tell server Unix caps we support */
+ if (ses->capabilities & CAP_UNIX)
+ reset_cifs_unix_caps(0, tcon, NULL, NULL);
+
+ /*
+ * Removed call to reopen open files here. It is safer (and faster) to
+ * reopen files one at a time as needed in read and write.
+ *
+ * FIXME: what about file locks? don't we need to reclaim them ASAP?
+ */
+
+out:
+ /*
+ * Check if handle based operation so we know whether we can continue
+ * or not without returning to caller to reset file handle
+ */
+ switch (smb_command) {
+ case SMB_COM_READ_ANDX:
+ case SMB_COM_WRITE_ANDX:
+ case SMB_COM_CLOSE:
+ case SMB_COM_FIND_CLOSE2:
+ case SMB_COM_LOCKING_ANDX:
+ rc = -EAGAIN;
+ }
+
+ unload_nls(nls_codepage);
+ return rc;
+}
+
+/* Allocate and return pointer to an SMB request buffer, and set basic
+ SMB information in the SMB header. If the return code is zero, this
+ function must have filled in request_buf pointer */
+static int
+small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+ void **request_buf)
+{
+ int rc = 0;
+
+ rc = cifs_reconnect_tcon(tcon, smb_command);
if (rc)
return rc;
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
{
int rc = 0;
- /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so
- check for tcp and smb session status done differently
- for those three - in the calling routine */
- if (tcon) {
- if (tcon->tidStatus == CifsExiting) {
- /* only tree disconnect, open, and write,
- (and ulogoff which does not have tcon)
- are allowed as we start force umount */
- if ((smb_command != SMB_COM_WRITE_ANDX) &&
- (smb_command != SMB_COM_OPEN_ANDX) &&
- (smb_command != SMB_COM_TREE_DISCONNECT)) {
- cFYI(1, ("can not send cmd %d while umounting",
- smb_command));
- return -ENODEV;
- }
- }
-
- if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
- (tcon->ses->server)) {
- struct nls_table *nls_codepage;
- /* Give Demultiplex thread up to 10 seconds to
- reconnect, should be greater than cifs socket
- timeout which is 7 seconds */
- while (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- wait_event_interruptible_timeout(tcon->ses->server->response_q,
- (tcon->ses->server->tcpStatus ==
- CifsGood), 10 * HZ);
- if (tcon->ses->server->tcpStatus ==
- CifsNeedReconnect) {
- /* on "soft" mounts we wait once */
- if (!tcon->retry ||
- (tcon->ses->status == CifsExiting)) {
- cFYI(1, ("gave up waiting on "
- "reconnect in smb_init"));
- return -EHOSTDOWN;
- } /* else "hard" mount - keep retrying
- until process is killed or server
- comes on-line */
- } else /* TCP session is reestablished now */
- break;
- }
- nls_codepage = load_nls_default();
- /* need to prevent multiple threads trying to
- simultaneously reconnect the same SMB session */
- down(&tcon->ses->sesSem);
- if (tcon->ses->need_reconnect)
- rc = cifs_setup_session(0, tcon->ses,
- nls_codepage);
- if (!rc && (tcon->need_reconnect)) {
- mark_open_files_invalid(tcon);
- rc = CIFSTCon(0, tcon->ses, tcon->treeName,
- tcon, nls_codepage);
- up(&tcon->ses->sesSem);
- /* BB FIXME add code to check if wsize needs
- update due to negotiated smb buffer size
- shrinking */
- if (rc == 0) {
- atomic_inc(&tconInfoReconnectCount);
- /* tell server Unix caps we support */
- if (tcon->ses->capabilities & CAP_UNIX)
- reset_cifs_unix_caps(
- 0 /* no xid */,
- tcon,
- NULL /* do not know sb */,
- NULL /* no vol info */);
- }
-
- cFYI(1, ("reconnect tcon rc = %d", rc));
- /* Removed call to reopen open files here.
- It is safer (and faster) to reopen files
- one at a time as needed in read and write */
-
- /* Check if handle based operation so we
- know whether we can continue or not without
- returning to caller to reset file handle */
- switch (smb_command) {
- case SMB_COM_READ_ANDX:
- case SMB_COM_WRITE_ANDX:
- case SMB_COM_CLOSE:
- case SMB_COM_FIND_CLOSE2:
- case SMB_COM_LOCKING_ANDX: {
- unload_nls(nls_codepage);
- return -EAGAIN;
- }
- }
- } else {
- up(&tcon->ses->sesSem);
- }
- unload_nls(nls_codepage);
-
- } else {
- return -EIO;
- }
- }
+ rc = cifs_reconnect_tcon(tcon, smb_command);
if (rc)
return rc;
@@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
if (is_unicode) {
__le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
GFP_KERNEL);
+ if (tmp == NULL) {
+ rc = -ENOMEM;
+ goto parse_DFS_referrals_exit;
+ }
cifsConvertToUCS((__le16 *) tmp, searchName,
PATH_MAX, nls_codepage, remap);
node->path_consumed = cifs_ucs2_bytes(tmp,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f3345d7fa79..d49682433c20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname,
}
static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr_storage *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
{
struct list_head *tmp;
struct TCP_Server_Info *server;
@@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
if (server->tcpStatus == CifsNew)
continue;
- if (addr->ss_family == AF_INET &&
- (addr4->sin_addr.s_addr !=
- server->addr.sockAddr.sin_addr.s_addr))
- continue;
- else if (addr->ss_family == AF_INET6 &&
- (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
- &addr6->sin6_addr) ||
- server->addr.sockAddr6.sin6_scope_id !=
- addr6->sin6_scope_id))
- continue;
+ switch (addr->ss_family) {
+ case AF_INET:
+ if (addr4->sin_addr.s_addr ==
+ server->addr.sockAddr.sin_addr.s_addr) {
+ addr4->sin_port = htons(port);
+ /* user overrode default port? */
+ if (addr4->sin_port) {
+ if (addr4->sin_port !=
+ server->addr.sockAddr.sin_port)
+ continue;
+ }
+ break;
+ } else
+ continue;
+
+ case AF_INET6:
+ if (ipv6_addr_equal(&addr6->sin6_addr,
+ &server->addr.sockAddr6.sin6_addr) &&
+ (addr6->sin6_scope_id ==
+ server->addr.sockAddr6.sin6_scope_id)) {
+ addr6->sin6_port = htons(port);
+ /* user overrode default port? */
+ if (addr6->sin6_port) {
+ if (addr6->sin6_port !=
+ server->addr.sockAddr6.sin6_port)
+ continue;
+ }
+ break;
+ } else
+ continue;
+ }
++server->srv_count;
write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
}
/* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session(&addr);
+ tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
if (tcp_ses)
return tcp_ses;
@@ -2636,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
return -EIO;
smb_buffer = cifs_buf_get();
- if (smb_buffer == NULL) {
+ if (smb_buffer == NULL)
return -ENOMEM;
- }
+
smb_buffer_response = smb_buffer;
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..a6424cfc0121 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
mutex_init(&pCifsFile->fh_mutex);
mutex_init(&pCifsFile->lock_mutex);
INIT_LIST_HEAD(&pCifsFile->llist);
- atomic_set(&pCifsFile->wrtPending, 0);
+ atomic_set(&pCifsFile->count, 1);
/* set the following in open now
pCifsFile->pfile = file; */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..fa7beac8b80e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private(
private_data->pInode = inode;
private_data->invalidHandle = false;
private_data->closePend = false;
- /* we have to track num writers to the inode, since writepages
- does not tell us which handle the write is for so there can
- be a close (overlapping with write) of the filehandle that
- cifs_writepages chose to use */
- atomic_set(&private_data->wrtPending, 0);
+ /* Initialize reference count to one. The private data is
+ freed on the release of the last reference */
+ atomic_set(&private_data->count, 1);
return private_data;
}
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file)
if (!pTcon->need_reconnect) {
write_unlock(&GlobalSMBSeslock);
timeout = 2;
- while ((atomic_read(&pSMBFile->wrtPending) != 0)
+ while ((atomic_read(&pSMBFile->count) != 1)
&& (timeout <= 2048)) {
/* Give write a better chance to get to
server ahead of the close. We do not
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file)
msleep(timeout);
timeout *= 4;
}
- if (atomic_read(&pSMBFile->wrtPending))
- cERROR(1, ("close with pending write"));
if (!pTcon->need_reconnect &&
!pSMBFile->invalidHandle)
rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file)
list_del(&pSMBFile->flist);
list_del(&pSMBFile->tlist);
write_unlock(&GlobalSMBSeslock);
- timeout = 10;
- /* We waited above to give the SMBWrite a chance to issue
- on the wire (so we do not get SMBWrite returning EBADF
- if writepages is racing with close. Note that writepages
- does not specify a file handle, so it is possible for a file
- to be opened twice, and the application close the "wrong"
- file handle - in these cases we delay long enough to allow
- the SMBWrite to get on the wire before the SMB Close.
- We allow total wait here over 45 seconds, more than
- oplock break time, and more than enough to allow any write
- to complete on the server, or to time out on the client */
- while ((atomic_read(&pSMBFile->wrtPending) != 0)
- && (timeout <= 50000)) {
- cERROR(1, ("writes pending, delay free of handle"));
- msleep(timeout);
- timeout *= 8;
- }
- kfree(file->private_data);
+ cifsFileInfo_put(file->private_data);
file->private_data = NULL;
} else
rc = -EBADF;
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
if (!open_file->invalidHandle) {
/* found a good file */
/* lock it so it will not be closed on us */
- atomic_inc(&open_file->wrtPending);
+ cifsFileInfo_get(open_file);
read_unlock(&GlobalSMBSeslock);
return open_file;
} /* else might as well continue, and look for
@@ -1276,7 +1255,7 @@ refind_writable:
if (open_file->pfile &&
((open_file->pfile->f_flags & O_RDWR) ||
(open_file->pfile->f_flags & O_WRONLY))) {
- atomic_inc(&open_file->wrtPending);
+ cifsFileInfo_get(open_file);
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -1293,7 +1272,7 @@ refind_writable:
else { /* start over in case this was deleted */
/* since the list could be modified */
read_lock(&GlobalSMBSeslock);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
goto refind_writable;
}
}
@@ -1309,7 +1288,7 @@ refind_writable:
read_lock(&GlobalSMBSeslock);
/* can not use this handle, no write
pending on this one after all */
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
if (open_file->closePend) /* list could have changed */
goto refind_writable;
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (open_file) {
bytes_written = cifs_write(open_file->pfile, write_data,
to-from, &offset);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
if ((bytes_written > 0) && (offset))
@@ -1562,7 +1541,7 @@ retry:
bytes_to_write, offset,
&bytes_written, iov, n_iov,
long_op);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
cifs_update_eof(cifsi, offset, bytes_written);
if (rc || bytes_written < bytes_to_write) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82d83839655e..1f09c7619319 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -800,7 +800,7 @@ set_via_filehandle:
if (open_file == NULL)
CIFSSMBClose(xid, pTcon, netfid);
else
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
out:
return rc;
}
@@ -1635,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
__u32 npid = open_file->pid;
rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
npid, false);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
cFYI(1, ("SetFSize for attrs rc = %d", rc));
if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
unsigned int bytes_written;
@@ -1790,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
u16 nfid = open_file->netfid;
u32 npid = open_file->pid;
rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
- atomic_dec(&open_file->wrtPending);
+ cifsFileInfo_put(open_file);
} else {
rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
cifs_sb->local_nls,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..1da4ab250eae 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
temp->pinode = pinode;
temp->tcon = tcon;
temp->netfid = fid;
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&temp->qhead, &GlobalOplock_Q);
- spin_unlock(&GlobalMid_Lock);
+ spin_lock(&cifs_oplock_lock);
+ list_add_tail(&temp->qhead, &cifs_oplock_list);
+ spin_unlock(&cifs_oplock_lock);
}
return temp;
-
}
void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
{
- spin_lock(&GlobalMid_Lock);
+ spin_lock(&cifs_oplock_lock);
/* should we check if list empty first? */
list_del(&oplockEntry->qhead);
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_oplock_lock);
kmem_cache_free(cifs_oplock_cachep, oplockEntry);
}
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
if (tcon == NULL)
return;
- spin_lock(&GlobalMid_Lock);
- list_for_each_entry(temp, &GlobalOplock_Q, qhead) {
+ spin_lock(&cifs_oplock_lock);
+ list_for_each_entry(temp, &cifs_oplock_list, qhead) {
if ((temp->tcon) && (temp->tcon == tcon)) {
list_del(&temp->qhead);
kmem_cache_free(cifs_oplock_cachep, temp);
}
}
- spin_unlock(&GlobalMid_Lock);
+ spin_unlock(&cifs_oplock_lock);
}
static int
diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972a..6d6f98fe64a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename,
if (!bprm)
goto out_files;
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&current->cred_guard_mutex))
+ retval = prepare_bprm_creds(bprm);
+ if (retval)
goto out_free;
- current->in_execve = 1;
-
- retval = -ENOMEM;
- bprm->cred = prepare_exec_creds();
- if (!bprm->cred)
- goto out_unlock;
retval = check_unsafe_exec(bprm);
if (retval < 0)
- goto out_unlock;
+ goto out_free;
clear_in_exec = retval;
+ current->in_execve = 1;
file = open_exec(filename);
retval = PTR_ERR(file);
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
acct_update_integrals(current);
free_bprm(bprm);
if (displaced)
@@ -1567,10 +1561,7 @@ out_file:
out_unmark:
if (clear_in_exec)
current->fs->in_exec = 0;
-
-out_unlock:
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
out_free:
free_bprm(bprm);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
};
static struct backing_dev_info configfs_backing_dev_info = {
+ .name = "configfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
#include <linux/swap.h>
#include <linux/bootmem.h>
#include <linux/fs_struct.h>
+#include <linux/hardirq.h>
#include "internal.h"
int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
#define CF_CONNECT_PENDING 3
#define CF_INIT_PENDING 4
#define CF_IS_OTHERCON 5
+#define CF_CLOSE 6
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
static inline void lowcomms_connect_sock(struct connection *con)
{
+ if (test_bit(CF_CLOSE, &con->flags))
+ return;
if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
queue_work(send_workqueue, &con->swork);
}
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
- sock_release(sock);
+ if (dlm_nodeid_to_addr(con->nodeid, &saddr))
goto out_err;
- }
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
@@ -1284,7 +1285,6 @@ out:
static void send_to_sock(struct connection *con)
{
int ret = 0;
- ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset;
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con)
if (con->sock == NULL)
goto out_connect;
- sendpage = con->sock->ops->sendpage;
-
spin_lock(&con->writequeue_lock);
for (;;) {
e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con)
ret = 0;
if (len) {
- ret = sendpage(con->sock, e->page, offset, len,
- msg_flags);
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
if (ret == -EAGAIN || ret == 0) {
cond_resched();
goto out;
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid)
log_print("closing connection to node %d", nodeid);
con = nodeid2con(nodeid, 0);
if (con) {
+ clear_bit(CF_CONNECT_PENDING, &con->flags);
+ clear_bit(CF_WRITE_PENDING, &con->flags);
+ set_bit(CF_CLOSE, &con->flags);
+ if (cancel_work_sync(&con->swork))
+ log_print("canceled swork for node %d", nodeid);
+ if (cancel_work_sync(&con->rwork))
+ log_print("canceled rwork for node %d", nodeid);
clean_one_writequeue(con);
close_connection(con, true);
}
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work)
if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
con->connect_action(con);
+ set_bit(CF_WRITE_PENDING, &con->flags);
}
- clear_bit(CF_WRITE_PENDING, &con->flags);
- send_to_sock(con);
+ if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
+ send_to_sock(con);
}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
return rv;
}
- return genlmsg_unicast(skb, listener_nlpid);
+ return genlmsg_unicast(&init_net, skb, listener_nlpid);
}
static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 00b30a2d5466..542f625312f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops;
extern const struct inode_operations ecryptfs_symlink_iops;
extern const struct super_operations ecryptfs_sops;
extern const struct dentry_operations ecryptfs_dops;
-extern struct address_space_operations ecryptfs_aops;
+extern const struct address_space_operations ecryptfs_aops;
extern int ecryptfs_verbosity;
extern unsigned int ecryptfs_message_buf_len;
extern signed long ecryptfs_message_wait_timeout;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 5c6bab9786e3..05772aeaa8f4 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -545,7 +545,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
return rc;
}
-struct address_space_operations ecryptfs_aops = {
+const struct address_space_operations ecryptfs_aops = {
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
.write_begin = ecryptfs_write_begin,
diff --git a/fs/exec.c b/fs/exec.c
index 4a8849e45b21..434dba778ccc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
@@ -678,8 +678,8 @@ exit:
}
EXPORT_SYMBOL(open_exec);
-int kernel_read(struct file *file, unsigned long offset,
- char *addr, unsigned long count)
+int kernel_read(struct file *file, loff_t offset,
+ char *addr, unsigned long count)
{
mm_segment_t old_fs;
loff_t pos = offset;
@@ -923,7 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
task_lock(tsk);
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
- perf_counter_comm(tsk);
+ perf_event_comm(tsk);
}
int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +997,7 @@ int flush_old_exec(struct linux_binprm * bprm)
* security domain:
*/
if (!get_dumpable(current->mm))
- perf_counter_exit_task(current);
+ perf_event_exit_task(current);
/* An exec changes our domain. We are no longer part of the thread
group */
@@ -1016,6 +1016,35 @@ out:
EXPORT_SYMBOL(flush_old_exec);
/*
+ * Prepare credentials and lock ->cred_guard_mutex.
+ * install_exec_creds() commits the new creds and drops the lock.
+ * Or, if exec fails before, free_bprm() should release ->cred and
+ * and unlock.
+ */
+int prepare_bprm_creds(struct linux_binprm *bprm)
+{
+ if (mutex_lock_interruptible(&current->cred_guard_mutex))
+ return -ERESTARTNOINTR;
+
+ bprm->cred = prepare_exec_creds();
+ if (likely(bprm->cred))
+ return 0;
+
+ mutex_unlock(&current->cred_guard_mutex);
+ return -ENOMEM;
+}
+
+void free_bprm(struct linux_binprm *bprm)
+{
+ free_arg_pages(bprm);
+ if (bprm->cred) {
+ mutex_unlock(&current->cred_guard_mutex);
+ abort_creds(bprm->cred);
+ }
+ kfree(bprm);
+}
+
+/*
* install the new credentials for this executable
*/
void install_exec_creds(struct linux_binprm *bprm)
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm)
commit_creds(bprm->cred);
bprm->cred = NULL;
-
- /* cred_guard_mutex must be held at least to this point to prevent
+ /*
+ * cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
- * credentials; any time after this it may be unlocked */
-
+ * credentials; any time after this it may be unlocked.
+ */
security_bprm_committed_creds(bprm);
+ mutex_unlock(&current->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
EXPORT_SYMBOL(search_binary_handler);
-void free_bprm(struct linux_binprm *bprm)
-{
- free_arg_pages(bprm);
- if (bprm->cred)
- abort_creds(bprm->cred);
- kfree(bprm);
-}
-
/*
* sys_execve() executes a new program.
*/
@@ -1277,20 +1299,15 @@ int do_execve(char * filename,
if (!bprm)
goto out_files;
- retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&current->cred_guard_mutex))
+ retval = prepare_bprm_creds(bprm);
+ if (retval)
goto out_free;
- current->in_execve = 1;
-
- retval = -ENOMEM;
- bprm->cred = prepare_exec_creds();
- if (!bprm->cred)
- goto out_unlock;
retval = check_unsafe_exec(bprm);
if (retval < 0)
- goto out_unlock;
+ goto out_free;
clear_in_exec = retval;
+ current->in_execve = 1;
file = open_exec(filename);
retval = PTR_ERR(file);
@@ -1340,7 +1357,6 @@ int do_execve(char * filename,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
acct_update_integrals(current);
free_bprm(bprm);
if (displaced)
@@ -1360,10 +1376,7 @@ out_file:
out_unmark:
if (clear_in_exec)
current->fs->in_exec = 0;
-
-out_unlock:
current->in_execve = 0;
- mutex_unlock(&current->cred_guard_mutex);
out_free:
free_bprm(bprm);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return error;
}
-static int
+int
ext2_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext2_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext2_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext2_new_inode.
*
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
#ifdef CONFIG_EXT2_FS_POSIX_ACL
/* acl.c */
-extern int ext2_permission (struct inode *, int);
+extern int ext2_check_acl (struct inode *, int);
extern int ext2_acl_chmod (struct inode *);
extern int ext2_init_acl (struct inode *, struct inode *);
#else
#include <linux/sched.h>
-#define ext2_permission NULL
+#define ext2_check_acl NULL
#define ext2_get_acl NULL
#define ext2_set_acl NULL
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
.fiemap = ext2_fiemap,
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..1c1638f873a4 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
unlock_buffer(bh);
mark_buffer_dirty_inode(bh, inode);
/* We used to sync bh here if IS_SYNC(inode).
- * But we now rely upon generic_osync_inode()
+ * But we now rely upon generic_write_sync()
* and b_inode_buffers. But not for directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f7873..23701f289e98 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
if (dir_de) {
if (old_dir != new_dir)
ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
+ else {
+ kunmap(dir_page);
+ page_cache_release(dir_page);
+ }
inode_dec_link_count(old_dir);
}
return 0;
@@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
};
const struct inode_operations ext2_special_inode_operations = {
@@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
.removexattr = generic_removexattr,
#endif
.setattr = ext2_setattr,
- .permission = ext2_permission,
+ .check_acl = ext2_check_acl,
};
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index b72b85884223..c18fbf3e4068 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block,
void **kaddr, unsigned long *pfn)
{
struct block_device *bdev = inode->i_sb->s_bdev;
- struct block_device_operations *ops = bdev->bd_disk->fops;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
sector_t sector;
sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index fb3c1a21b135..522b15498f45 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -29,23 +29,25 @@ config EXT3_FS
module will be called ext3.
config EXT3_DEFAULTS_TO_ORDERED
- bool "Default to 'data=ordered' in ext3 (legacy option)"
+ bool "Default to 'data=ordered' in ext3"
depends on EXT3_FS
help
- If a filesystem does not explicitly specify a data ordering
- mode, and the journal capability allowed it, ext3 used to
- historically default to 'data=ordered'.
-
- That was a rather unfortunate choice, because it leads to all
- kinds of latency problems, and the 'data=writeback' mode is more
- appropriate these days.
-
- You should probably always answer 'n' here, and if you really
- want to use 'data=ordered' mode, set it in the filesystem itself
- with 'tune2fs -o journal_data_ordered'.
-
- But if you really want to enable the legacy default, you can do
- so by answering 'y' to this question.
+ The journal mode options for ext3 have different tradeoffs
+ between when data is guaranteed to be on disk and
+ performance. The use of "data=writeback" can cause
+ unwritten data to appear in files after an system crash or
+ power failure, which can be a security issue. However,
+ "data=ordered" mode can also result in major performance
+ problems, including seconds-long delays before an fsync()
+ call returns. For details, see:
+
+ http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs
+
+ If you have been historically happy with ext3's performance,
+ data=ordered mode will be a safe choice and you should
+ answer 'y' here. If you understand the reliability and data
+ privacy issues of data=writeback and are willing to make
+ that trade off, answer 'n'.
config EXT3_FS_XATTR
bool "Ext3 extended attributes"
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-static int
+int
ext3_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext3_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext3_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext3_new_inode.
*
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
#ifdef CONFIG_EXT3_FS_POSIX_ACL
/* acl.c */
-extern int ext3_permission (struct inode *, int);
+extern int ext3_check_acl (struct inode *, int);
extern int ext3_acl_chmod (struct inode *);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT3_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext3_permission NULL
+#define ext3_check_acl NULL
static inline int
ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
return 0;
}
-static ssize_t
-ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
- ssize_t ret;
- int err;
-
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
-
- /*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
- */
- if (file->f_flags & O_SYNC) {
- /*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
- */
- if (!ext3_should_journal_data(inode))
- return ret;
-
- goto force_commit;
- }
-
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
-
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
-
-force_commit:
- err = ext3_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
-}
-
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = generic_file_aio_read,
- .aio_write = ext3_file_write,
+ .aio_write = generic_file_aio_write,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
@@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
.fiemap = ext3_fiemap,
};
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
*/
#include <linux/time.h>
+#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
}
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
+ goto flush;
/*
* The VFS has written the file data. If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ goto out;
}
+flush:
+ /*
+ * In case we didn't commit a transaction, we have to flush
+ * disk caches manually so that data really is on persistent
+ * storage
+ */
+ if (test_opt(inode->i_sb, BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
out:
return ret;
}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b49908a167ae..cd098a7b77fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
{
+ int ret;
+
jbd_debug(2, "restarting handle %p\n", handle);
- return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ /*
+ * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
+ * At this moment, get_block can be called only for blocks inside
+ * i_size since page cache has been already dropped and writes are
+ * blocked by i_mutex. So we can safely drop the truncate_mutex.
+ */
+ mutex_unlock(&EXT3_I(inode)->truncate_mutex);
+ ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
+ mutex_lock(&EXT3_I(inode)->truncate_mutex);
+ return ret;
}
/*
@@ -2072,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
ext3_journal_dirty_metadata(handle, bh);
}
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext3_journal_get_write_access(handle, bh);
@@ -2282,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext3_mark_inode_dirty(handle, inode);
- ext3_journal_test_restart(handle, inode);
+ truncate_restart_transaction(handle, inode);
}
ext3_free_blocks(handle, inode, nr, 1);
@@ -2892,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle,
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
+again:
+ /* we can't allow multiple procs in here at once, its a bit racey */
+ lock_buffer(bh);
+
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
if (ei->i_state & EXT3_STATE_NEW)
@@ -2951,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle,
/* If this is the first large file
* created, add a flag to the superblock.
*/
+ unlock_buffer(bh);
err = ext3_journal_get_write_access(handle,
EXT3_SB(sb)->s_sbh);
if (err)
goto out_brelse;
+
ext3_update_dynamic_rev(sb);
EXT3_SET_RO_COMPAT_FEATURE(sb,
EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
handle->h_sync = 1;
err = ext3_journal_dirty_metadata(handle,
EXT3_SB(sb)->s_sbh);
+ /* get our lock and start over */
+ goto again;
}
}
}
@@ -2983,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ unlock_buffer(bh);
rc = ext3_journal_dirty_metadata(handle, bh);
if (!err)
err = rc;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
};
const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
.listxattr = ext3_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext3_permission,
+ .check_acl = ext3_check_acl,
};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 524b349c6299..72743d360509 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl
#endif
}
+static char *data_mode_string(unsigned long mode)
+{
+ switch (mode) {
+ case EXT3_MOUNT_JOURNAL_DATA:
+ return "journal";
+ case EXT3_MOUNT_ORDERED_DATA:
+ return "ordered";
+ case EXT3_MOUNT_WRITEBACK_DATA:
+ return "writeback";
+ }
+ return "unknown";
+}
+
/*
* Show an option if
* - it's set to a non-default value OR
@@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
- if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
- seq_puts(seq, ",data=journal");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
- seq_puts(seq, ",data=ordered");
- else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
- seq_puts(seq, ",data=writeback");
-
+ seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt &
+ EXT3_MOUNT_DATA_FLAGS));
if (test_opt(sb, DATA_ERR_ABORT))
seq_puts(seq, ",data_err=abort");
@@ -712,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
static ssize_t ext3_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
-static struct dquot_operations ext3_quota_operations = {
+static const struct dquot_operations ext3_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -729,7 +737,7 @@ static struct dquot_operations ext3_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext3_qctl_operations = {
+static const struct quotactl_ops ext3_qctl_operations = {
.quota_on = ext3_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
@@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb,
datacheck:
if (is_remount) {
if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS)
- != data_opt) {
- printk(KERN_ERR
- "EXT3-fs: cannot change data "
- "mode on remount\n");
- return 0;
- }
+ == data_opt)
+ break;
+ printk(KERN_ERR
+ "EXT3-fs (device %s): Cannot change "
+ "data mode on remount. The filesystem "
+ "is mounted in data=%s mode and you "
+ "try to remount it in data=%s mode.\n",
+ sb->s_id,
+ data_mode_string(sbi->s_mount_opt &
+ EXT3_MOUNT_DATA_FLAGS),
+ data_mode_string(data_opt));
+ return 0;
} else {
sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS;
sbi->s_mount_opt |= data_opt;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
To enable backwards compatibility so that systems that are
still expecting to mount ext4 filesystems using ext4dev,
- chose Y here. This feature will go away by 2.6.31, so
+ choose Y here. This feature will go away by 2.6.31, so
please arrange to get your userspace programs fixed!
config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
+
+config EXT4_DEBUG
+ bool "EXT4 debugging support"
+ depends on EXT4_FS
+ help
+ Enables run-time debugging support for the ext4 filesystem.
+
+ If you select Y here, then you will be able to turn on debugging
+ with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-static int
+int
ext4_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int
-ext4_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, ext4_check_acl);
-}
-
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* acl.c */
-extern int ext4_permission(struct inode *, int);
+extern int ext4_check_acl(struct inode *, int);
extern int ext4_acl_chmod(struct inode *);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
-#define ext4_permission NULL
+#define ext4_check_acl NULL
static inline int
ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
* new bitmap information
*/
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);
+ grp->bb_free += blocks_freed;
up_write(&grp->alloc_sem);
/* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;
/* prefer goal again. length */
-#define EXT4_MB_HINT_MERGE 1
+#define EXT4_MB_HINT_MERGE 0x0001
/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 2
+#define EXT4_MB_HINT_RESERVED 0x0002
/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 4
+#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
-#define EXT4_MB_HINT_FIRST 8
+#define EXT4_MB_HINT_FIRST 0x0008
/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 16
+#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
-#define EXT4_MB_HINT_DATA 32
+#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
-#define EXT4_MB_HINT_NOPREALLOC 64
+#define EXT4_MB_HINT_NOPREALLOC 0x0040
/* allocate for locality group */
-#define EXT4_MB_HINT_GROUP_ALLOC 128
+#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
/* allocate goal blocks or none */
-#define EXT4_MB_HINT_GOAL_ONLY 256
+#define EXT4_MB_HINT_GOAL_ONLY 0x0100
/* goal is meaningful */
-#define EXT4_MB_HINT_TRY_GOAL 512
+#define EXT4_MB_HINT_TRY_GOAL 0x0200
/* blocks already pre-reserved by delayed allocation */
-#define EXT4_MB_DELALLOC_RESERVED 1024
+#define EXT4_MB_DELALLOC_RESERVED 0x0400
+/* We are doing stream allocation */
+#define EXT4_MB_STREAM_ALLOC 0x0800
struct ext4_allocation_request {
@@ -112,6 +114,21 @@ struct ext4_allocation_request {
};
/*
+ * For delayed allocation tracking
+ */
+struct mpage_da_data {
+ struct inode *inode;
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
+ unsigned long first_page, next_page; /* extent of pages */
+ struct writeback_control *wbc;
+ int io_done;
+ int pages_written;
+ int retval;
+};
+
+/*
* Special inodes numbers
*/
#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +268,6 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
-#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
+#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
#endif
};
+/* Max physical block we can addres w/o extents */
+#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+
/*
* Structure of an inode on the disk
*/
@@ -456,7 +476,6 @@ struct move_extent {
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length */
};
-#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
-#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
unsigned long s_gdb_count; /* Number of group descriptor blocks */
unsigned long s_desc_per_block; /* Number of group descriptors per block */
ext4_group_t s_groups_count; /* Number of groups in the fs */
+ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
unsigned long s_overhead_last; /* Last calculated overhead */
unsigned long s_blocks_last; /* Last seen block count */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
atomic_t s_mb_lost_chunks;
atomic_t s_mb_preallocated;
atomic_t s_mb_discarded;
+ atomic_t s_lock_busy;
/* locality groups */
struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
ext4_fsblk_t, unsigned long, int, unsigned long *);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
- ext4_grpblk_t add);
extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
- unsigned short bb_first_free;
- unsigned short bb_free;
- unsigned short bb_fragments;
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- unsigned short bb_counters[];
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+ * 5 free 8-block regions. */
};
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+#define EXT4_MAX_CONTENTION 8
+#define EXT4_CONTENTION_THRESHOLD 2
+
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
ext4_group_t group)
{
return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}
+/*
+ * Returns true if the filesystem is busy enough that attempts to
+ * access the block group locks has run into contention.
+ */
+static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
+{
+ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spin_lock(ext4_group_lock_ptr(sb, group));
+ spinlock_t *lock = ext4_group_lock_ptr(sb, group);
+ if (spin_trylock(lock))
+ /*
+ * We're able to grab the lock right away, so drop the
+ * lock contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ else {
+ /*
+ * The lock is busy, so bump the contention counter,
+ * and then wait on the spin lock.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
+ EXT4_MAX_CONTENTION);
+ spin_lock(lock);
+ }
}
static inline void ext4_unlock_group(struct super_block *sb,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_DEBUG is defined you can use the 'extdebug' mount option
- * to get lots of info about what's going on.
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
*/
#define EXT_DEBUG__
#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
#define EXT_BREAK 1
#define EXT_REPEAT 2
+/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
#define EXT_MAX_BLOCK 0xffffffff
/*
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
handle, err);
}
else
- brelse(bh);
+ bforget(bh);
return err;
}
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- mark_buffer_dirty(bh);
+ if (inode && bh)
+ mark_buffer_dirty_inode(bh, inode);
+ else
+ mark_buffer_dirty(bh);
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static int ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_truncate_extend_restart(handle_t *handle,
+ struct inode *inode,
+ int needed)
{
int err;
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
err = ext4_journal_extend(handle, needed);
if (err <= 0)
return err;
- return ext4_journal_restart(handle, needed);
+ err = ext4_truncate_restart_trans(handle, inode, needed);
+ /*
+ * We have dropped i_data_sem so someone might have cached again
+ * an extent we are going to truncate.
+ */
+ ext4_ext_invalidate_cache(inode);
+
+ return err;
}
/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
return newblock;
}
-static int ext4_ext_space_block(struct inode *inode)
+static inline int ext4_ext_space_block(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 6)
- size = 6;
+ if (size > 6)
+ size = 6;
#endif
+ }
return size;
}
-static int ext4_ext_space_block_idx(struct inode *inode)
+static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 5)
- size = 5;
+ if (size > 5)
+ size = 5;
#endif
+ }
return size;
}
-static int ext4_ext_space_root(struct inode *inode)
+static inline int ext4_ext_space_root(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 3)
- size = 3;
+ if (size > 3)
+ size = 3;
#endif
+ }
return size;
}
-static int ext4_ext_space_root_idx(struct inode *inode)
+static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
+ if (!check) {
#ifdef AGGRESSIVE_TEST
- if (size > 4)
- size = 4;
+ if (size > 4)
+ size = 4;
#endif
+ }
return size;
}
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
int lcap, icap, rcap, leafs, idxs, num;
int newextents = blocks;
- rcap = ext4_ext_space_root_idx(inode);
- lcap = ext4_ext_space_block(inode);
- icap = ext4_ext_space_block_idx(inode);
+ rcap = ext4_ext_space_root_idx(inode, 0);
+ lcap = ext4_ext_space_block(inode, 0);
+ icap = ext4_ext_space_block_idx(inode, 0);
/* number of new leaf blocks needed */
num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
if (depth == ext_depth(inode)) {
if (depth == 0)
- max = ext4_ext_space_root(inode);
+ max = ext4_ext_space_root(inode, 1);
else
- max = ext4_ext_space_root_idx(inode);
+ max = ext4_ext_space_root_idx(inode, 1);
} else {
if (depth == 0)
- max = ext4_ext_space_block(inode);
+ max = ext4_ext_space_block(inode, 1);
else
- max = ext4_ext_space_block_idx(inode);
+ max = ext4_ext_space_block_idx(inode, 1);
}
return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
idx_pblock(path->p_idx));
} else if (path->p_ext) {
- ext_debug(" %d:%d:%llu ",
+ ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext_pblock(path->p_ext));
} else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
+ ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
- ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
}
ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
}
path->p_ext = l - 1;
- ext_debug(" -> %d:%llu:%d ",
+ ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext_pblock(path->p_ext),
+ ext4_ext_is_uninitialized(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_depth = 0;
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
- eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+ eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
ext4_ext_invalidate_cache(inode);
return 0;
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = 0;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
path[depth].p_ext++;
while (path[depth].p_ext <=
EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:%d in new leaf %llu\n",
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(path[depth].p_ext->ee_block),
ext_pblock(path[depth].p_ext),
+ ext4_ext_is_uninitialized(path[depth].p_ext),
ext4_ext_get_actual_len(path[depth].p_ext),
newblock);
/*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh = ext_block_hdr(bh);
neh->eh_entries = cpu_to_le16(1);
neh->eh_magic = EXT4_EXT_MAGIC;
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
/* old root could have indexes or leaves
* so calculate e_max right way */
if (ext_depth(inode))
- neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
else
- neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
goto out;
curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
- curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+ curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
curp->p_hdr->eh_entries = cpu_to_le16(1);
curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
/* try to insert block into found extent and return */
if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append %d block to %d:%d (from %llu)\n",
+ ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
+ ext4_ext_is_uninitialized(ex),
ext4_ext_get_actual_len(ex), ext_pblock(ex));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -1651,9 +1676,10 @@ has_space:
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %d:%llu:%d\n",
+ ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext));
path[depth].p_ext = EXT_FIRST_EXTENT(eh);
} else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1689,11 @@ has_space:
len = EXT_MAX_EXTENT(eh) - nearex;
len = (len - 1) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1703,11 @@ has_space:
BUG_ON(newext->ee_block == nearex->ee_block);
len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
len = len < 0 ? 0 : len;
- ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+ ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
"move %d from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext_pblock(newext),
+ ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
nearex, len, nearex + 1, nearex + 2);
memmove(nearex + 1, nearex, len);
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
else
uninitialized = 0;
- ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+ ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ uninitialized, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
}
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
- err = ext4_ext_journal_restart(handle, credits);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
goto out;
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (err == 0) {
ext_inode_hdr(inode)->eh_depth = 0;
ext_inode_hdr(inode)->eh_max =
- cpu_to_le16(ext4_ext_space_root(inode));
+ cpu_to_le16(ext4_ext_space_root(inode, 0));
err = ext4_ext_dirty(handle, inode, path);
}
}
@@ -2743,6 +2772,7 @@ insert:
} else if (err)
goto fix_extent_len;
out:
+ ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
fix_extent_len:
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
struct ext4_allocation_request ar;
__clear_bit(BH_New, &bh_result->b_state);
- ext_debug("blocks %u/%u requested for inode %u\n",
+ ext_debug("blocks %u/%u requested for inode %lu\n",
iblock, max_blocks, inode->i_ino);
/* check in cache */
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = iblock - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (iblock - ee_block);
- ext_debug("%u fit into %lu:%d -> %llu\n", iblock,
+ ext_debug("%u fit into %u:%d -> %llu\n", iblock,
ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
- ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+ ext_debug("allocate new block: goal %llu, found %llu/%u\n",
ar.goal, newblock, allocated);
/* try to insert new extent into found leaf and return */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..5ca3eca70a1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode;
- ssize_t ret;
- int err;
+ struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
/*
* If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
}
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
- /*
- * Skip flushing if there was an error, or if nothing was written.
- */
- if (ret <= 0)
- return ret;
-
- /*
- * If the inode is IS_SYNC, or is O_SYNC and we are doing data
- * journalling then we need to make sure that we force the transaction
- * to disk to keep all metadata uptodate synchronously.
- */
- if (file->f_flags & O_SYNC) {
- /*
- * If we are non-data-journaled, then the dirty data has
- * already been flushed to backing store by generic_osync_inode,
- * and the inode has been flushed too if there have been any
- * modifications other than mere timestamp updates.
- *
- * Open question --- do we care about flushing timestamps too
- * if the inode is IS_SYNC?
- */
- if (!ext4_should_journal_data(inode))
- return ret;
-
- goto force_commit;
- }
-
- /*
- * So we know that there has been no forced data flush. If the inode
- * is marked IS_SYNC, we need to force one ourselves.
- */
- if (!IS_SYNC(inode))
- return ret;
-
- /*
- * Open question #2 --- should we force data to disk here too? If we
- * don't, the only impact is that data=writeback filesystems won't
- * flush data to disk automatically on IS_SYNC, only metadata (but
- * historically, that is what ext2 has done.)
- */
-
-force_commit:
- err = ext4_force_commit(inode->i_sb);
- if (err)
- return err;
- return ret;
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
}
static struct vm_operations_struct ext4_file_vm_ops = {
@@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
.fallocate = ext4_fallocate,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int ret = 0;
+ int err, ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
goto out;
}
+ if (!journal)
+ ret = sync_mapping_buffers(inode->i_mapping);
+
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* sys_fsync did this */
};
- ret = sync_inode(inode, &wbc);
- if (journal && (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ err = sync_inode(inode, &wbc);
+ if (ret == 0)
+ ret = err;
}
out:
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, ext4_free_inodes_count(sb, gdp), x);
+ (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..3a798737e305 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
{
+ int ret;
+
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
- return ext4_journal_restart(handle, blocks_for_truncate(inode));
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ return ret;
}
/*
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode,
int n = 0;
int final = 0;
- if (i_block < 0) {
- ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
- } else if (i_block < direct_blocks) {
+ if (i_block < direct_blocks) {
offsets[n++] = i_block;
final = direct_blocks;
} else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
*
* Normally this function find the preferred place for block allocation,
* returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
*/
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
Indirect *partial)
{
+ ext4_fsblk_t goal;
+
/*
* XXX need to get goal block from mballoc's data structures
*/
- return ext4_find_near(inode, partial);
+ goal = ext4_find_near(inode, partial);
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+ return goal;
}
/**
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (*err)
goto failed_out;
+ BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+
target -= count;
/* allocate blocks for indirect blocks */
while (index < indirect_blks && count) {
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
+ BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
if (*err && (target == blks)) {
/*
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
BUFFER_TRACE(bh, "call get_create_access");
err = ext4_journal_get_create_access(handle, bh);
if (err) {
+ /* Don't brelse(bh) here; it's done in
+ * ext4_journal_forget() below */
unlock_buffer(bh);
- brelse(bh);
goto failed;
}
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
ext4_discard_preallocations(inode);
}
-static int check_block_validity(struct inode *inode, sector_t logical,
- sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *msg,
+ sector_t logical, sector_t phys, int len)
{
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- ext4_error(inode->i_sb, "check_block_validity",
+ ext4_error(inode->i_sb, msg,
"inode #%lu logical block %llu mapped to %llu "
"(size %d)", inode->i_ino,
(unsigned long long) logical,
(unsigned long long) phys, len);
- WARN_ON(1);
return -EIO;
}
return 0;
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system corruption",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
}
}
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, block,
- bh->b_blocknr, retval);
+ int ret = check_block_validity(inode, "file system "
+ "corruption after allocation",
+ block, bh->b_blocknr, retval);
if (ret != 0)
return ret;
}
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page,
* Delayed allocation stuff
*/
-struct mpage_da_data {
- struct inode *inode;
- sector_t b_blocknr; /* start block number of extent */
- size_t b_size; /* size of extent */
- unsigned long b_state; /* state of the extent */
- unsigned long first_page, next_page; /* extent of pages */
- struct writeback_control *wbc;
- int io_done;
- int pages_written;
- int retval;
-};
-
/*
* mpage_da_submit_io - walks through extent of pages and try to write
* them with writepage() call back
@@ -2329,7 +2337,7 @@ static int __mpage_da_writepage(struct page *page,
/*
* Rest of the page in the page_vec
* redirty then and skip then. We will
- * try to to write them again after
+ * try to write them again after
* starting a new transaction
*/
redirty_page_for_writepage(wbc, page);
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping,
long pages_skipped;
int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
+ loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
trace_ext4_da_writepages(inode, wbc);
@@ -2850,6 +2859,7 @@ retry:
mpd.io_done = 1;
ret = MPAGE_DA_EXTENT_TAIL;
}
+ trace_ext4_da_write_pages(inode, &mpd);
wbc->nr_to_write -= mpd.pages_written;
ext4_journal_stop(handle);
@@ -2905,6 +2915,7 @@ out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
}
@@ -3117,6 +3128,8 @@ out:
*/
int ext4_alloc_da_blocks(struct inode *inode)
{
+ trace_ext4_alloc_da_blocks(inode);
+
if (!EXT4_I(inode)->i_reserved_data_blocks &&
!EXT4_I(inode)->i_reserved_meta_blocks)
return 0;
@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
ext4_handle_dirty_metadata(handle, inode, bh);
}
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
return;
if (try_to_extend_transaction(handle, inode)) {
ext4_mark_inode_dirty(handle, inode);
- ext4_journal_test_restart(handle, inode);
+ ext4_truncate_restart_trans(handle, inode,
+ blocks_for_truncate(inode));
}
ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- if (ei->i_disksize && inode->i_size == 0 &&
- !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
- struct ext4_iloc *iloc)
+ struct ext4_iloc *iloc,
+ int do_sync)
{
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (ext4_inode_blocks_set(handle, raw_inode, ei))
goto out_brelse;
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
- /* clear the migrate flag in the raw_inode */
- raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags);
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_HURD))
raw_inode->i_file_acl_high =
@@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!err)
- err = rc;
+ /*
+ * If we're not using a journal and we were called from
+ * ext4_write_inode() to sync the inode (making do_sync true),
+ * we can just use sync_dirty_buffer() directly to do our dirty
+ * work. Testing s_journal here is a bit redundant but it's
+ * worth it to avoid potential future trouble.
+ */
+ if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
+ BUFFER_TRACE(bh, "call sync_dirty_buffer");
+ sync_dirty_buffer(bh);
+ } else {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!err)
+ err = rc;
+ }
ei->i_state &= ~EXT4_STATE_NEW;
out_brelse:
@@ -4684,19 +4710,32 @@ out_brelse:
*/
int ext4_write_inode(struct inode *inode, int wait)
{
+ int err;
+
if (current->flags & PF_MEMALLOC)
return 0;
- if (ext4_journal_current_handle()) {
- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
- dump_stack();
- return -EIO;
- }
+ if (EXT4_SB(inode->i_sb)->s_journal) {
+ if (ext4_journal_current_handle()) {
+ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
- if (!wait)
- return 0;
+ if (!wait)
+ return 0;
+
+ err = ext4_force_commit(inode->i_sb);
+ } else {
+ struct ext4_iloc iloc;
- return ext4_force_commit(inode->i_sb);
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ return err;
+ err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
+ inode, &iloc, wait);
+ }
+ return err;
}
/*
@@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
get_bh(iloc->bh);
/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
- err = ext4_do_update_inode(handle, inode, iloc);
+ err = ext4_do_update_inode(handle, inode, iloc, 0);
put_bh(iloc->bh);
return err;
}
@@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
else
len = PAGE_CACHE_SIZE;
+ lock_page(page);
+ /*
+ * return if we have all the buffers mapped. This avoid
+ * the need to call write_begin/write_end which does a
+ * journal_start/journal_stop which can block and take
+ * long time
+ */
if (page_has_buffers(page)) {
- /* return if we have all the buffers mapped */
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped))
+ ext4_bh_unmapped)) {
+ unlock_page(page);
goto out_unlock;
+ }
}
+ unlock_page(page);
/*
* OK, we need to fill the hole... Do write_begin write_end
* to do block allocation/reservation.We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
me.donor_start, me.len, &me.moved_len);
fput(donor_filp);
- if (!err)
- if (copy_to_user((struct move_extent *)arg,
- &me, sizeof(me)))
- return -EFAULT;
+ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+ return -EFAULT;
+
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
*/
#include "mballoc.h"
+#include <linux/debugfs.h>
#include <trace/events/ext4.h>
/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
/* FIXME!! need more doc */
static void ext4_mb_mark_free_simple(struct super_block *sb,
- void *buddy, unsigned first, int len,
+ void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned short min;
- unsigned short max;
- unsigned short chunk;
+ ext4_grpblk_t min;
+ ext4_grpblk_t max;
+ ext4_grpblk_t chunk;
unsigned short border;
BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- unsigned short max = EXT4_BLOCKS_PER_GROUP(sb);
- unsigned short i = 0;
- unsigned short first;
- unsigned short len;
+ ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_grpblk_t first;
+ ext4_grpblk_t len;
unsigned free = 0;
unsigned fragments = 0;
unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
char *data;
char *bitmap;
- mb_debug("init page %lu\n", page->index);
+ mb_debug(1, "init page %lu\n", page->index);
inode = page->mapping->host;
sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
set_bitmap_uptodate(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
- mb_debug("read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", first_group + i);
}
/* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug("put buddy for group %u in page %lu/%x\n",
+ mb_debug(1, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
- sizeof(unsigned short)*(sb->s_blocksize_bits+2));
+ sizeof(*grinfo->bb_counters) *
+ (sb->s_blocksize_bits+2));
/*
* incore got set to the group block bitmap below
*/
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug("put bitmap for group %u in page %lu/%x\n",
+ mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
/* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
return err;
}
+static noinline_for_stack
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int ret = 0;
+ void *bitmap;
+ int blocks_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ struct ext4_group_info *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug(1, "init group %u\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ this_grp = ext4_get_group_info(sb, group);
+ /*
+ * This ensures that we don't reinit the buddy cache
+ * page which map to the group from which we are already
+ * allocating. If we are looking at the buddy cache we would
+ * have taken a reference using ext4_mb_load_buddy and that
+ * would have taken the alloc_sem lock.
+ */
+ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
- mb_debug("load group %u\n", group);
+ mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* groups mapped by the page is blocked
* till we are done with allocation
*/
+repeat_load_buddy:
down_read(e4b->alloc_semp);
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ /* we need to check for group need init flag
+ * with alloc_semp held so that we can be sure
+ * that new blocks didn't get added to the group
+ * when we are loading the buddy cache
+ */
+ up_read(e4b->alloc_semp);
+ /*
+ * we need full data about the group
+ * to make a good selection
+ */
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ return ret;
+ goto repeat_load_buddy;
+ }
+
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->alloc_semp = e4b->alloc_semp;
e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
}
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
- int ret;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
- struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
-
- mb_debug("init group %lu\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- this_grp = ext4_get_group_info(sb, group);
- /*
- * This ensures we don't add group
- * to this buddy cache via resize
- */
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
- /*
- * somebody initialized the group
- * return without doing anything
- */
- ret = 0;
- goto err;
- }
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
-
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
- /*
- * If both the bitmap and buddy are in
- * the same page we don't need to force
- * init the buddy
- */
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
-err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
- return ret;
-}
-
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
- loff_t size, isize;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ ngroups = sbi->s_blockfile_groups;
+
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
}
bsbits = ac->ac_sb->s_blocksize_bits;
- /* if stream allocation is enabled, use global goal */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
- if (size < isize)
- size = isize;
- if (size < sbi->s_mb_stream_request &&
- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* if stream allocation is enabled, use global goal */
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
/* TBD: may be hot point */
spin_lock(&sbi->s_md_lock);
ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
+
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -2015,27 +2037,6 @@ repeat:
if (grp->bb_free == 0)
continue;
- /*
- * if the group is already init we check whether it is
- * a good group and if not we don't load the buddy
- */
- if (EXT4_MB_GRP_NEED_INIT(grp)) {
- /*
- * we need full data about the group
- * to make a good selection
- */
- err = ext4_mb_init_group(sb, group);
- if (err)
- goto out;
- }
-
- /*
- * If the particular group doesn't satisfy our
- * criteria we continue with the next group
- */
- if (!ext4_mb_good_group(ac, group, cr))
- continue;
-
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN) {
seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
- "%-5s %-2s %-5s %-5s %-5s %-6s\n",
+ "%-5s %-2s %-6s %-5s %-5s %-6s\n",
"pid", "inode", "original", "goal", "result", "found",
"grps", "cr", "flags", "merge", "tail", "broken");
return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
if (hs->op == EXT4_MB_HISTORY_ALLOC) {
fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
- "%-5u %-5s %-5u %-6u\n";
+ "0x%04x %-5s %-5u %-6u\n";
sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
hs->result.fe_start, hs->result.fe_len,
hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_history_ops = {
+static const struct seq_operations ext4_mb_seq_history_ops = {
.start = ext4_mb_seq_history_start,
.next = ext4_mb_seq_history_next,
.stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
return count;
}
-static struct file_operations ext4_mb_seq_history_fops = {
+static const struct file_operations ext4_mb_seq_history_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_history_open,
.read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct ext4_buddy e4b;
struct sg {
struct ext4_group_info info;
- unsigned short counters[16];
+ ext4_grpblk_t counters[16];
} sg;
group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
{
}
-static struct seq_operations ext4_mb_seq_groups_ops = {
+static const struct seq_operations ext4_mb_seq_groups_ops = {
.start = ext4_mb_seq_groups_start,
.next = ext4_mb_seq_groups_next,
.stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
-static struct file_operations ext4_mb_seq_groups_fops = {
+static const struct file_operations ext4_mb_seq_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root.rb_node = NULL;;
+ meta_group_info[i]->bb_free_root.rb_node = NULL;
#ifdef DOUBLE_CHECK
{
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
-/*
- * Update an existing group.
- * This function is used for online resize
- */
-void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
-{
- grp->bb_free += add;
-}
-
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
- int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int num_meta_group_infos;
int num_meta_group_infos_max;
int array_size;
- struct ext4_group_info **meta_group_info;
struct ext4_group_desc *desc;
/* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
goto err_freesgi;
}
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
-
- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
- for (i = 0; i < num_meta_group_infos; i++) {
- if ((i + 1) == num_meta_group_infos)
- metalen = sizeof(*meta_group_info) *
- (ngroups -
- (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
- meta_group_info = kmalloc(metalen, GFP_KERNEL);
- if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
- goto err_freemeta;
- }
- sbi->s_group_info[i] = meta_group_info;
- }
-
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
while (i-- > 0)
kfree(ext4_get_group_info(sb, i));
i = num_meta_group_infos;
-err_freemeta:
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned max;
int ret;
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
return -ENOMEM;
}
- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
- mb_debug("mballoc: %u PAs left\n", count);
+ mb_debug(1, "mballoc: %u PAs left\n", count);
}
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
list_for_each_safe(l, ltmp, &txn->t_private_list) {
entry = list_entry(l, struct ext4_free_data, list);
- mb_debug("gonna free %u blocks in group %u (0x%p):",
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
ext4_mb_release_desc(&e4b);
}
- mb_debug("freed %u blocks in %u structures\n", count, count2);
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+}
+
+#ifdef CONFIG_EXT4_DEBUG
+u8 mb_enable_debug __read_mostly;
+
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_debug;
+
+static void __init ext4_create_debugfs_entry(void)
+{
+ debugfs_dir = debugfs_create_dir("ext4", NULL);
+ if (debugfs_dir)
+ debugfs_debug = debugfs_create_u8("mballoc-debug",
+ S_IRUGO | S_IWUSR,
+ debugfs_dir,
+ &mb_enable_debug);
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+ debugfs_remove(debugfs_debug);
+ debugfs_remove(debugfs_dir);
}
+#else
+
+static void __init ext4_create_debugfs_entry(void)
+{
+}
+
+static void ext4_remove_debugfs_entry(void)
+{
+}
+
+#endif
+
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
+ ext4_create_debugfs_entry();
return 0;
}
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
kmem_cache_destroy(ext4_free_ext_cachep);
+ ext4_remove_debugfs_entry();
}
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
else
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug("#%u: goal %u blocks for locality group\n",
+ mb_debug(1, "#%u: goal %u blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
ac->ac_o_ex.fe_logical < pa->pa_lstart));
- /* skip PA normalized request doesn't overlap with */
- if (pa->pa_lstart >= end) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
- if (pa_end <= start) {
+ /* skip PAs this normalized request doesn't overlap with */
+ if (pa->pa_lstart >= end || pa_end <= start) {
spin_unlock(&pa->pa_lock);
continue;
}
BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+ /* adjust start or end to be adjacent to this pa */
if (pa_end <= ac->ac_o_ex.fe_logical) {
BUG_ON(pa_end < start);
start = pa_end;
- }
-
- if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
+ } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
BUG_ON(pa->pa_lstart > end);
end = pa->pa_lstart;
}
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size,
+ mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
(unsigned) orig_size, (unsigned) start);
}
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
- mb_debug("use %llu/%u from inode pa %p\n", start, len, pa);
+ mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
}
/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
}
/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
continue;
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ continue;
+
/* found preallocated blocks, use them */
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
preallocated += len;
count++;
}
- mb_debug("prellocated %u for group %u\n", preallocated, group);
+ mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
- mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
- mb_debug("new group pa %p: %llu/%u for %u\n", pa,
+ mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_group_pa(ac, pa);
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
- mb_debug(" free preallocated %u/%u in group %u\n",
+ mb_debug(1, " free preallocated %u/%u in group %u\n",
(unsigned) start, (unsigned) next - bit,
(unsigned) group);
free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
int busy = 0;
int free = 0;
- mb_debug("discard preallocation for group %u\n", group);
+ mb_debug(1, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
return;
}
- mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+ mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
trace_ext4_discard_preallocations(inode);
INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
{
BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
}
-#ifdef MB_DEBUG
+#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
ext4_get_group_no_and_offset(sb, pa->pa_pstart,
NULL, &start);
spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%lu:%d:%u \n", i,
- start, pa->pa_len);
+ printk(KERN_ERR "PA:%u:%d:%u \n", i,
+ start, pa->pa_len);
}
ext4_unlock_group(sb, i);
if (grp->bb_free == 0)
continue;
- printk(KERN_ERR "%lu: %d/%d \n",
+ printk(KERN_ERR "%u: %d/%d \n",
i, grp->bb_free, grp->bb_fragments);
}
printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
+ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ return;
+
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
- isize = i_size_read(ac->ac_inode) >> bsbits;
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
size = max(size, isize);
- /* don't use group allocation for large files */
- if (size >= sbi->s_mb_stream_request)
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
+ }
- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
+ /* don't use group allocation for large files */
+ if (size >= sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
+ }
BUG_ON(ac->ac_lg != NULL);
/*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
struct ext4_prealloc_space *pa, *tmp;
struct ext4_allocation_context *ac;
- mb_debug("discard locality group preallocation\n");
+ mb_debug(1, "discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
/*
*/
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...) printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...) \
+ do { \
+ if ((n) <= mb_enable_debug) { \
+ printk(KERN_DEBUG "(%s, %d): %s: ", \
+ __FILE__, __LINE__, __func__); \
+ printk(fmt, ## a); \
+ } \
+ } while (0)
#else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
#endif
/*
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
unsigned pa_deleted;
ext4_fsblk_t pa_pstart; /* phys. block */
ext4_lblk_t pa_lstart; /* log. block */
- unsigned short pa_len; /* len of preallocated chunk */
- unsigned short pa_free; /* how many blocks are free */
+ ext4_grpblk_t pa_len; /* len of preallocated chunk */
+ ext4_grpblk_t pa_free; /* how many blocks are free */
unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
ext4_lblk_t fe_logical;
ext4_grpblk_t fe_start;
ext4_group_t fe_group;
- int fe_len;
+ ext4_grpblk_t fe_len;
};
/*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
down_write(&EXT4_I(inode)->i_data_sem);
/*
- * if EXT4_EXT_MIGRATE is cleared a block allocation
+ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
- ~EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block allocation
- * via mmap write to holes. If we have allocated new blocks we fail
- * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
- * The flag is updated with i_data_sem held to prevent racing with
- * block allocation.
+ * Even though we take i_mutex we can still cause block
+ * allocation via mmap write to holes. If we have allocated
+ * new blocks we fail migrate. New block allocation will
+ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+ * with i_data_sem held to prevent racing with block
+ * allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+ EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
up_read((&EXT4_I(inode)->i_data_sem));
handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
tmp_inode->i_nlink = 0;
ext4_journal_stop(handle);
-
+ unlock_new_inode(tmp_inode);
iput(tmp_inode);
return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
#include "ext4_extents.h"
#include "ext4.h"
-#define get_ext_path(path, inode, block, ret) \
- do { \
- path = ext4_ext_find_extent(inode, block, path); \
- if (IS_ERR(path)) { \
- ret = PTR_ERR(path); \
- path = NULL; \
- } \
- } while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode: an inode which is searched
+ * @lblock: logical block number to find an extent path
+ * @path: pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+ struct ext4_ext_path **path)
+{
+ int ret = 0;
+
+ *path = ext4_ext_find_extent(inode, lblock, *path);
+ if (IS_ERR(*path)) {
+ ret = PTR_ERR(*path);
+ *path = NULL;
+ } else if ((*path)[ext_depth(inode)].p_ext == NULL)
+ ret = -ENODATA;
+
+ return ret;
+}
/**
* copy_extent_status - Copy the extent's initialization status
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
+ * mext_check_null_inode - NULL check for two inodes
+ *
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+ const char *function)
+{
+ int ret = 0;
+
+ if (inode1 == NULL) {
+ ext4_error(inode2->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 NULL inode2 %lu", inode2->i_ino);
+ ret = -EIO;
+ } else if (inode2 == NULL) {
+ ext4_error(inode1->i_sb, function,
+ "Both inodes should not be NULL: "
+ "inode1 %lu inode2 NULL", inode1->i_ino);
+ ret = -EIO;
+ }
+ return ret;
+}
+
+/**
* mext_double_down_read - Acquire two inodes' read semaphore
*
* @orig_inode: original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
/*
* Use the inode number to provide the stable locking order instead
* of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
static void
mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_read(&EXT4_I(orig_inode)->i_data_sem);
up_read(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
static void
mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
{
- BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
}
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (new_flag) {
- get_ext_path(orig_path, orig_inode, eblock, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, eblock, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}
if (end_flag) {
- get_ext_path(orig_path, orig_inode,
- le32_to_cpu(end_ext->ee_block) - 1, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode,
+ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+ if (err)
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* oext |-----------|
* new_ext |-------|
*/
- BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "new_ext_end(%u) should be less than or equal to "
+ "oext->ee_block(%u) + oext_alen(%d) - 1",
+ new_ext_end, le32_to_cpu(oext->ee_block),
+ oext_alen);
+ ret = -EIO;
+ goto out;
+ }
/*
* Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
o_end, &start_ext, &new_ext, &end_ext);
+out:
return ret;
}
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* @orig_off: block offset of original inode
* @donor_off: block offset of donor inode
* @max_count: the maximun length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
*/
-static void
+static int
mext_calc_swap_extents(struct ext4_extent *tmp_dext,
struct ext4_extent *tmp_oext,
ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
ext4_lblk_t diff, orig_diff;
struct ext4_extent dext_old, oext_old;
+ BUG_ON(orig_off != donor_off);
+
+ /* original and donor extents have to cover the same block offset */
+ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+ le32_to_cpu(tmp_oext->ee_block) +
+ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+ return -ENODATA;
+
+ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+ le32_to_cpu(tmp_dext->ee_block) +
+ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+ return -ENODATA;
+
dext_old = *tmp_dext;
oext_old = *tmp_oext;
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
copy_extent_status(&oext_old, tmp_dext);
copy_extent_status(&dext_old, tmp_oext);
+
+ return 0;
}
/**
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
mext_double_down_write(orig_inode, donor_inode);
/* Get the original extent for the block "orig_off" */
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (err)
goto out;
/* Get the donor extent for the head */
- get_ext_path(donor_path, donor_inode, donor_off, err);
- if (donor_path == NULL)
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
dext = donor_path[depth].p_ext;
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);
+ if (err)
+ goto out;
/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+ if (!dext) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "The extent for donor must be found");
+ err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "Donor offset(%u) and the first block of donor "
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+ err = -EIO;
+ goto out;
+ }
/* Set donor extent to orig extent */
err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, orig_off, err);
- if (orig_path == NULL)
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (donor_path)
ext4_ext_drop_refs(donor_path);
- get_ext_path(donor_path, donor_inode,
- donor_off, err);
- if (donor_path == NULL)
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (err)
goto out;
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
}
tmp_dext = *dext;
- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
- donor_off,
- count - replaced_count);
+ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ if (err)
+ goto out;
}
out:
@@ -740,7 +815,7 @@ out:
* on success, or a negative error value on failure.
*/
static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
int block_len_in_page, int uninit)
{
@@ -871,6 +946,7 @@ out:
if (PageLocked(page))
unlock_page(page);
page_cache_release(page);
+ ext4_journal_stop(handle);
}
out2:
ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
struct inode *donor_inode, __u64 orig_start,
__u64 donor_start, __u64 *len, __u64 moved_len)
{
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
/* Regular file check */
if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
ext4_debug("ext4 move extent: The argument files should be "
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if ((orig_start > MAX_DEFRAG_SIZE) ||
- (donor_start > MAX_DEFRAG_SIZE) ||
- (*len > MAX_DEFRAG_SIZE) ||
- (orig_start + *len > MAX_DEFRAG_SIZE)) {
- ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
- "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+ if ((orig_start > EXT_MAX_BLOCK) ||
+ (donor_start > EXT_MAX_BLOCK) ||
+ (*len > EXT_MAX_BLOCK) ||
+ (orig_start + *len > EXT_MAX_BLOCK)) {
+ ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+ "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
if (orig_inode->i_size > donor_inode->i_size) {
- if (orig_start >= donor_inode->i_size) {
+ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start >= donor_blocks) {
ext4_debug("ext4 move extent: orig start offset "
- "[%llu] should be less than donor file size "
- "[%lld] [ino:orig %lu, donor_inode %lu]\n",
- orig_start, donor_inode->i_size,
+ "[%llu] should be less than donor file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, donor_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > donor_inode->i_size) {
+ /* TODO: eliminate this artificial restriction */
+ if (orig_start + *len > donor_blocks) {
ext4_debug("ext4 move extent: End offset [%llu] should "
- "be less than donor file size [%lld]."
- "So adjust length from %llu to %lld "
+ "be less than donor file blocks [%u]."
+ "So adjust length from %llu to %llu "
"[ino:orig %lu, donor %lu]\n",
- orig_start + *len, donor_inode->i_size,
- *len, donor_inode->i_size - orig_start,
+ orig_start + *len, donor_blocks,
+ *len, donor_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = donor_inode->i_size - orig_start;
+ *len = donor_blocks - orig_start;
}
} else {
- if (orig_start >= orig_inode->i_size) {
+ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+ if (orig_start >= orig_blocks) {
ext4_debug("ext4 move extent: start offset [%llu] "
- "should be less than original file size "
- "[%lld] [inode:orig %lu, donor %lu]\n",
- orig_start, orig_inode->i_size,
+ "should be less than original file blocks "
+ "[%u] [ino:orig %lu, donor %lu]\n",
+ orig_start, orig_blocks,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- if (orig_start + *len > orig_inode->i_size) {
+ if (orig_start + *len > orig_blocks) {
ext4_debug("ext4 move extent: Adjust length "
- "from %llu to %lld. Because it should be "
- "less than original file size "
+ "from %llu to %llu. Because it should be "
+ "less than original file blocks "
"[ino:orig %lu, donor %lu]\n",
- *len, orig_inode->i_size - orig_start,
+ *len, orig_blocks - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- *len = orig_inode->i_size - orig_start;
+ *len = orig_blocks - orig_start;
}
}
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode,
* @inode1: the inode structure
* @inode2: the inode structure
*
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
- if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
- if (inode1)
- mutex_lock(&inode1->i_mutex);
- else if (inode2)
- mutex_lock(&inode2->i_mutex);
- return;
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
+ if (inode1 == inode2) {
+ mutex_lock(&inode1->i_mutex);
+ goto out;
}
if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
}
+
+out:
+ return ret;
}
/**
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
*/
-static void
+static int
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
+ int ret = 0;
+
+ BUG_ON(inode1 == NULL && inode2 == NULL);
+
+ ret = mext_check_null_inode(inode1, inode2, __func__);
+ if (ret < 0)
+ goto out;
+
if (inode1)
mutex_unlock(&inode1->i_mutex);
if (inode2 && inode2 != inode1)
mutex_unlock(&inode2->i_mutex);
+
+out:
+ return ret;
}
/**
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
ext4_lblk_t rest_blocks;
pgoff_t orig_page_offset = 0, seq_end_page;
- int ret, depth, last_extent = 0;
+ int ret1, ret2, depth, last_extent = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
int uninit;
/* protect orig and donor against a truncate */
- mext_inode_double_lock(orig_inode, donor_inode);
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+ return ret1;
mext_double_down_read(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len, *moved_len);
mext_double_up_read(orig_inode, donor_inode);
- if (ret)
- goto out2;
+ if (ret1)
+ goto out;
file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
block_end = block_start + len - 1;
if (file_end < block_end)
len -= block_end - file_end;
- get_ext_path(orig_path, orig_inode, block_start, ret);
- if (orig_path == NULL)
- goto out2;
+ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+ if (ret1)
+ goto out;
/* Get path structure to check the hole */
- get_ext_path(holecheck_path, orig_inode, block_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (ret1)
goto out;
depth = ext_depth(orig_inode);
ext_cur = holecheck_path[depth].p_ext;
- if (ext_cur == NULL) {
- ret = -EINVAL;
- goto out;
- }
/*
- * Get proper extent whose ee_block is beyond block_start
- * if block_start was within the hole.
+ * Get proper starting location of block replacement if block_start was
+ * within the hole.
*/
if (le32_to_cpu(ext_cur->ee_block) +
ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ /*
+ * The hole exists between extents or the tail of
+ * original file.
+ */
last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
goto out;
}
- }
- seq_start = block_start;
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+ /* The hole exists at the beginning of original file. */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ else
+ seq_start = block_start;
/* No blocks within the specified range. */
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
- ret = -EINVAL;
+ ret1 = -EINVAL;
goto out;
}
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
last_extent = mext_next_extent(orig_inode, holecheck_path,
&ext_cur);
if (last_extent < 0) {
- ret = last_extent;
+ ret1 = last_extent;
break;
}
add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
while (orig_page_offset <= seq_end_page) {
/* Swap original branches with new branches */
- ret = move_extent_par_page(o_filp, donor_inode,
+ ret1 = move_extent_per_page(o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
block_len_in_page, uninit);
- if (ret < 0)
+ if (ret1 < 0)
goto out;
orig_page_offset++;
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
- BUG_ON(*moved_len > len);
+ if (*moved_len > len) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret1 = -EIO;
+ goto out;
+ }
data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- get_ext_path(holecheck_path, orig_inode,
- seq_start, ret);
- if (holecheck_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (ret1)
break;
depth = holecheck_path->p_depth;
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- get_ext_path(orig_path, orig_inode, seq_start, ret);
- if (orig_path == NULL)
+ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (ret1)
break;
ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
-out2:
- mext_inode_double_unlock(orig_inode, donor_inode);
- if (ret)
- return ret;
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
- /* All of the specified blocks must be exchanged in succeed */
- BUG_ON(*moved_len != len);
+ if (ret1)
+ return ret1;
+ else if (ret2)
+ return ret2;
return 0;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
- return make_indexed_dir(handle, dentry, inode, bh);
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ retval = make_indexed_dir(handle, dentry, inode, bh);
+ if (retval == -ENOSPC)
+ brelse(bh);
+ return retval;
+ }
brelse(bh);
}
bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ if (retval == -ENOSPC)
+ brelse(bh);
+ return retval;
}
/*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
+ memset(&node2->fake, 0, sizeof(struct fake_dirent));
node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
sb->s_blocksize);
- node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- bh = NULL;
+ if (err != -ENOSPC)
+ bh = NULL;
goto cleanup;
journal_error:
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
struct inode *inode = old_dentry->d_inode;
int err, retries = 0;
- if (EXT4_DIR_LINK_MAX(inode))
+ if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
/*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= EXT4_LINK_MAX)
+ EXT4_DIR_LINK_MAX(new_dir))
goto end_rename;
}
if (!new_bh) {
@@ -2536,7 +2544,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
.fiemap = ext4_fiemap,
};
@@ -2548,5 +2556,5 @@ const struct inode_operations ext4_special_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
#endif
- .permission = ext4_permission,
+ .check_acl = ext4_check_acl,
};
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
struct inode *inode = NULL;
handle_t *handle;
int gdb_off, gdb_num;
- int num_grp_locked = 0;
int err, err2;
gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* using the new disk blocks.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)((char *)primary->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* descriptor
*/
err = ext4_mb_add_groupinfo(sb, input->group, gdp);
- if (err) {
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+ if (err)
goto exit_journal;
- }
/*
* Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
/* Update the global fs size fields */
sbi->s_groups_count++;
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
ext4_handle_dirty_metadata(handle, NULL, primary);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..df539ba27779 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "mballoc.h"
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
errstr = "Out of memory";
break;
case -EROFS:
- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+ if (!sb || (EXT4_SB(sb)->s_journal &&
+ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
errstr = "Journal has aborted";
else
errstr = "Readonly filesystem";
@@ -962,7 +964,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
-static struct dquot_operations ext4_quota_operations = {
+static const struct dquot_operations ext4_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -983,7 +985,7 @@ static struct dquot_operations ext4_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops ext4_qctl_operations = {
+static const struct quotactl_ops ext4_qctl_operations = {
.quota_on = ext4_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
*journal_devnum = option;
break;
case Opt_journal_checksum:
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
- break;
+ break; /* Kept for backwards compatibility */
case Opt_journal_async_commit:
set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
- set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
break;
case Opt_noload:
set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
- ext4_free_inodes_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
- ext4_free_blks_count(sb, gdp));
- atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
- ext4_used_dirs_count(sb, gdp));
+ atomic_add(ext4_free_inodes_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_add(ext4_free_blks_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(ext4_used_dirs_count(sb, gdp),
+ &sbi->s_flex_groups[flex_group].used_dirs);
}
return 1;
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
.release = ext4_sb_release,
};
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
+ return 0;
+ }
+
+ if (readonly)
+ return 1;
+
+ /* Check that feature set is OK for a read-write mount */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ return 0;
+ }
+ /*
+ * Large file size enabled file system can only be mounted
+ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+ */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (sizeof(blkcnt_t) < sizeof(u64)) {
+ ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+ "cannot be mounted RDWR without "
+ "CONFIG_LBDAF");
+ return 0;
+ }
+ }
+ return 1;
+}
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int db_count;
unsigned int i;
int needs_recovery, has_huge_files;
- int features;
__u64 blocks_count;
int err;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
*/
- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
- if (features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
- ~EXT4_FEATURE_INCOMPAT_SUPP));
- goto failed_mount;
- }
- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
- if (!(sb->s_flags & MS_RDONLY) && features) {
- ext4_msg(sb, KERN_ERR,
- "Couldn't mount RDWR because of "
- "unsupported optional features (%x)",
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
goto failed_mount;
- }
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (has_huge_files) {
- /*
- * Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LBDAF
- */
- if (sizeof(root->i_blocks) < sizeof(u64) &&
- !(sb->s_flags & MS_RDONLY)) {
- ext4_msg(sb, KERN_ERR, "Filesystem with huge "
- "files cannot be mounted read-write "
- "without CONFIG_LBDAF");
- goto failed_mount;
- }
- }
+
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_blocks_count(es) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ /*
+ * Test whether we have more sectors than will fit in sector_t,
+ * and whether the max offset is addressable by the page cache.
+ */
+ if ((ext4_blocks_count(es) >
+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+ (ext4_blocks_count(es) >
+ (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
ext4_msg(sb, KERN_ERR, "filesystem"
- " too large to mount safely");
+ " too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+ ret = -EFBIG;
goto failed_mount;
}
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
sbi->s_groups_count = blocks_count;
+ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
+ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount4;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ jbd2_journal_set_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
+ else
jbd2_journal_clear_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- } else {
- jbd2_journal_clear_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- }
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- es->s_wtime = cpu_to_le32(get_seconds());
+ /*
+ * If the file system is mounted read-only, don't update the
+ * superblock write time. This avoids updating the superblock
+ * write time when we are mounting the root file system
+ * read/only but we need to replay the journal; at that point,
+ * for people who are east of GMT and who make their clock
+ * tick in localtime for Windows bug-for-bug compatibility,
+ * the clock is set in the future, and this will cause e2fsck
+ * to complain and force a full file system check.
+ */
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_wtime = cpu_to_le32(get_seconds());
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_mark_recovery_complete(sb, es);
} else {
- int ret;
- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
- ext4_msg(sb, KERN_WARNING, "couldn't "
- "remount RDWR because of unsupported "
- "optional features (%x)",
- (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+ /* Make sure we can mount this feature set readwrite */
+ if (!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
}
-
/*
* Make sure the group descriptor checksums
* are sane. If they aren't, refuse to remount r/w.
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
get_bh(new_bh);
} else {
/* We need to allocate a new block */
- ext4_fsblk_t goal = ext4_group_first_block_no(sb,
+ ext4_fsblk_t goal, block;
+
+ goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+
+ /* non-extent files can't have physical blocks past 2^32 */
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ block = ext4_new_meta_blocks(handle, inode,
goal, NULL, &error);
if (error)
goto cleanup;
+
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- if (IS_SYNC(inode))
- err = sync_page_range_nolock(inode, mapping, start, count);
+ if (IS_SYNC(inode)) {
+ int err2;
+
+ /*
+ * Opencode syncing since we don't have a file open to use
+ * standard fsync path.
+ */
+ err = filemap_fdatawrite_range(mapping, start,
+ start + count - 1);
+ err2 = sync_mapping_buffers(mapping);
+ if (!err)
+ err = err2;
+ err2 = write_inode_now(inode, 1);
+ if (!err)
+ err = err2;
+ if (!err) {
+ err = filemap_fdatawait_range(mapping, start,
+ start + count - 1);
+ }
+ }
out:
return err;
}
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..4e35be873e09 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
MSDOS_I(inode)->i_start = new_dclus;
MSDOS_I(inode)->i_logstart = new_dclus;
/*
- * Since generic_osync_inode() synchronize later if
- * this is not directory, we don't here.
+ * Since generic_write_sync() synchronizes regular files later,
+ * we sync here only directories.
*/
if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
ret = fat_sync_inode(inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,245 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"
+#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
-/**
- * writeback_acquire - attempt to get exclusive writeback access to a device
- * @bdi: the device's backing_dev_info structure
- *
- * It is a waste of resources to have more than one pdflush thread blocked on
- * a single request queue. Exclusion at the request_queue level is obtained
- * via a flag in the request_queue's backing_dev_info.state.
- *
- * Non-request_queue-backed address_spaces will share default_backing_dev_info,
- * unless they implement their own. Which is somewhat inefficient, as this
- * may prevent concurrent writeback against multiple devices.
+/*
+ * We don't actually have pdflush, but this one is exported though /proc...
+ */
+int nr_pdflush_threads;
+
+/*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+ long nr_pages;
+ struct super_block *sb;
+ enum writeback_sync_modes sync_mode;
+ int for_kupdate;
+ int range_cyclic;
+};
+
+/*
+ * Work items for the bdi_writeback threads
*/
-static int writeback_acquire(struct backing_dev_info *bdi)
+struct bdi_work {
+ struct list_head list; /* pending work list */
+ struct rcu_head rcu_head; /* for RCU free/clear of work */
+
+ unsigned long seen; /* threads that have seen this work */
+ atomic_t pending; /* number of threads still to do work */
+
+ struct wb_writeback_args args; /* writeback arguments */
+
+ unsigned long state; /* flag bits, see WS_* */
+};
+
+enum {
+ WS_USED_B = 0,
+ WS_ONSTACK_B,
+};
+
+#define WS_USED (1 << WS_USED_B)
+#define WS_ONSTACK (1 << WS_ONSTACK_B)
+
+static inline bool bdi_work_on_stack(struct bdi_work *work)
+{
+ return test_bit(WS_ONSTACK_B, &work->state);
+}
+
+static inline void bdi_work_init(struct bdi_work *work,
+ struct wb_writeback_args *args)
{
- return !test_and_set_bit(BDI_pdflush, &bdi->state);
+ INIT_RCU_HEAD(&work->rcu_head);
+ work->args = *args;
+ work->state = WS_USED;
}
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
*
- * Determine whether there is writeback in progress against a backing device.
+ * Determine whether there is writeback waiting to be handled against a
+ * backing device.
*/
int writeback_in_progress(struct backing_dev_info *bdi)
{
- return test_bit(BDI_pdflush, &bdi->state);
+ return !list_empty(&bdi->work_list);
}
-/**
- * writeback_release - relinquish exclusive writeback access against a device.
- * @bdi: the device's backing_dev_info structure
- */
-static void writeback_release(struct backing_dev_info *bdi)
+static void bdi_work_clear(struct bdi_work *work)
{
- BUG_ON(!writeback_in_progress(bdi));
- clear_bit(BDI_pdflush, &bdi->state);
+ clear_bit(WS_USED_B, &work->state);
+ smp_mb__after_clear_bit();
+ /*
+ * work can have disappeared at this point. bit waitq functions
+ * should be able to tolerate this, provided bdi_sched_wait does
+ * not dereference it's pointer argument.
+ */
+ wake_up_bit(&work->state, WS_USED_B);
}
-static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+static void bdi_work_free(struct rcu_head *head)
{
- if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
- struct dentry *dentry;
- const char *name = "?";
+ struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
- dentry = d_find_alias(inode);
- if (dentry) {
- spin_lock(&dentry->d_lock);
- name = (const char *) dentry->d_name.name;
- }
- printk(KERN_DEBUG
- "%s(%d): dirtied inode %lu (%s) on %s\n",
- current->comm, task_pid_nr(current), inode->i_ino,
- name, inode->i_sb->s_id);
- if (dentry) {
- spin_unlock(&dentry->d_lock);
- dput(dentry);
- }
- }
+ if (!bdi_work_on_stack(work))
+ kfree(work);
+ else
+ bdi_work_clear(work);
}
-/**
- * __mark_inode_dirty - internal function
- * @inode: inode to mark
- * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
- * Mark an inode as dirty. Callers should use mark_inode_dirty or
- * mark_inode_dirty_sync.
- *
- * Put the inode on the super block's dirty list.
- *
- * CAREFUL! We mark it dirty unconditionally, but move it onto the
- * dirty list only if it is hashed or if it refers to a blockdev.
- * If it was not hashed, it will never be added to the dirty list
- * even if it is later hashed, as it will have been marked dirty already.
- *
- * In short, make sure you hash any inodes _before_ you start marking
- * them dirty.
- *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
- * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
- * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
- * the kernel-internal blockdev inode represents the dirtying time of the
- * blockdev's pages. This is why for I_DIRTY_PAGES we always use
- * page->mapping->host, so the page-dirtying time is recorded in the internal
- * blockdev inode.
- */
-void __mark_inode_dirty(struct inode *inode, int flags)
+static void wb_work_complete(struct bdi_work *work)
{
- struct super_block *sb = inode->i_sb;
+ const enum writeback_sync_modes sync_mode = work->args.sync_mode;
+ int onstack = bdi_work_on_stack(work);
/*
- * Don't do this for I_DIRTY_PAGES - that doesn't actually
- * dirty the inode itself
+ * For allocated work, we can clear the done/seen bit right here.
+ * For on-stack work, we need to postpone both the clear and free
+ * to after the RCU grace period, since the stack could be invalidated
+ * as soon as bdi_work_clear() has done the wakeup.
*/
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
- if (sb->s_op->dirty_inode)
- sb->s_op->dirty_inode(inode);
- }
+ if (!onstack)
+ bdi_work_clear(work);
+ if (sync_mode == WB_SYNC_NONE || onstack)
+ call_rcu(&work->rcu_head, bdi_work_free);
+}
+static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
+{
/*
- * make sure that changes are seen by all cpus before we test i_state
- * -- mikulas
+ * The caller has retrieved the work arguments from this work,
+ * drop our reference. If this is the last ref, delete and free it
*/
- smp_mb();
+ if (atomic_dec_and_test(&work->pending)) {
+ struct backing_dev_info *bdi = wb->bdi;
- /* avoid the locking if we can */
- if ((inode->i_state & flags) == flags)
- return;
+ spin_lock(&bdi->wb_lock);
+ list_del_rcu(&work->list);
+ spin_unlock(&bdi->wb_lock);
- if (unlikely(block_dump))
- block_dump___mark_inode_dirty(inode);
+ wb_work_complete(work);
+ }
+}
- spin_lock(&inode_lock);
- if ((inode->i_state & flags) != flags) {
- const int was_dirty = inode->i_state & I_DIRTY;
+static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+{
+ work->seen = bdi->wb_mask;
+ BUG_ON(!work->seen);
+ atomic_set(&work->pending, bdi->wb_cnt);
+ BUG_ON(!bdi->wb_cnt);
- inode->i_state |= flags;
+ /*
+ * list_add_tail_rcu() contains the necessary barriers to
+ * make sure the above stores are seen before the item is
+ * noticed on the list
+ */
+ spin_lock(&bdi->wb_lock);
+ list_add_tail_rcu(&work->list, &bdi->work_list);
+ spin_unlock(&bdi->wb_lock);
- /*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
- */
- if (inode->i_state & I_SYNC)
- goto out;
+ /*
+ * If the default thread isn't there, make sure we add it. When
+ * it gets created and wakes up, we'll run this work.
+ */
+ if (unlikely(list_empty_careful(&bdi->wb_list)))
+ wake_up_process(default_backing_dev_info.wb.task);
+ else {
+ struct bdi_writeback *wb = &bdi->wb;
- /*
- * Only add valid (hashed) inodes to the superblock's
- * dirty list. Add blockdev inodes as well.
- */
- if (!S_ISBLK(inode->i_mode)) {
- if (hlist_unhashed(&inode->i_hash))
- goto out;
- }
- if (inode->i_state & (I_FREEING|I_CLEAR))
- goto out;
+ if (wb->task)
+ wake_up_process(wb->task);
+ }
+}
- /*
- * If the inode was already on s_dirty/s_io/s_more_io, don't
- * reposition it (that would break s_dirty time-ordering).
- */
- if (!was_dirty) {
- inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &sb->s_dirty);
- }
+/*
+ * Used for on-stack allocated work items. The caller needs to wait until
+ * the wb threads have acked the work before it's safe to continue.
+ */
+static void bdi_wait_on_work_clear(struct bdi_work *work)
+{
+ wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
+ struct wb_writeback_args *args)
+{
+ struct bdi_work *work;
+
+ /*
+ * This is WB_SYNC_NONE writeback, so if allocation fails just
+ * wakeup the thread for old dirty data writeback
+ */
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ bdi_work_init(work, args);
+ bdi_queue_work(bdi, work);
+ } else {
+ struct bdi_writeback *wb = &bdi->wb;
+
+ if (wb->task)
+ wake_up_process(wb->task);
}
-out:
- spin_unlock(&inode_lock);
}
-EXPORT_SYMBOL(__mark_inode_dirty);
+/**
+ * bdi_sync_writeback - start and wait for writeback
+ * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
+ *
+ * Description:
+ * This does WB_SYNC_ALL data integrity writeback and waits for the
+ * IO to complete. Callers must hold the sb s_umount semaphore for
+ * reading, to avoid having the super disappear before we are done.
+ */
+static void bdi_sync_writeback(struct backing_dev_info *bdi,
+ struct super_block *sb)
+{
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_ALL,
+ .nr_pages = LONG_MAX,
+ .range_cyclic = 0,
+ };
+ struct bdi_work work;
-static int write_inode(struct inode *inode, int sync)
+ bdi_work_init(&work, &args);
+ work.state |= WS_ONSTACK;
+
+ bdi_queue_work(bdi, &work);
+ bdi_wait_on_work_clear(&work);
+}
+
+/**
+ * bdi_start_writeback - start writeback
+ * @bdi: the backing device to write from
+ * @nr_pages: the number of pages to write
+ *
+ * Description:
+ * This does WB_SYNC_NONE opportunistic writeback. The IO is only
+ * started when this function returns, we make no guarentees on
+ * completion. Caller need not hold sb s_umount semaphore.
+ *
+ */
+void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
- if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
- return inode->i_sb->s_op->write_inode(inode, sync);
- return 0;
+ struct wb_writeback_args args = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_pages = nr_pages,
+ .range_cyclic = 1,
+ };
+
+ bdi_alloc_queue_work(bdi, &args);
}
/*
@@ -191,31 +265,32 @@ static int write_inode(struct inode *inode, int sync)
* furthest end of its superblock's dirty-inode list.
*
* Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the s_dirty list. If that is
+ * already the most-recently-dirtied inode on the b_dirty list. If that is
* the case then the inode must have been redirtied while it was being written
* out and we don't reset its dirtied_when.
*/
static void redirty_tail(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- if (!list_empty(&sb->s_dirty)) {
- struct inode *tail_inode;
+ if (!list_empty(&wb->b_dirty)) {
+ struct inode *tail;
- tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
- if (time_before(inode->dirtied_when,
- tail_inode->dirtied_when))
+ tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+ if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_list, &sb->s_dirty);
+ list_move(&inode->i_list, &wb->b_dirty);
}
/*
- * requeue inode for re-scanning after sb->s_io list is exhausted.
+ * requeue inode for re-scanning after bdi->b_io list is exhausted.
*/
static void requeue_io(struct inode *inode)
{
- list_move(&inode->i_list, &inode->i_sb->s_more_io);
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+
+ list_move(&inode->i_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -262,20 +337,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
/*
* Queue all expired dirty inodes for io, eldest first.
*/
-static void queue_io(struct super_block *sb,
- unsigned long *older_than_this)
+static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
- list_splice_init(&sb->s_more_io, sb->s_io.prev);
- move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+ list_splice_init(&wb->b_more_io, wb->b_io.prev);
+ move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
-int sb_has_dirty_inodes(struct super_block *sb)
+static int write_inode(struct inode *inode, int sync)
{
- return !list_empty(&sb->s_dirty) ||
- !list_empty(&sb->s_io) ||
- !list_empty(&sb->s_more_io);
+ if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
+ return inode->i_sb->s_op->write_inode(inode, sync);
+ return 0;
}
-EXPORT_SYMBOL(sb_has_dirty_inodes);
/*
* Wait for writeback on an inode to complete.
@@ -322,11 +395,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (inode->i_state & I_SYNC) {
/*
* If this inode is locked for writeback and we are not doing
- * writeback-for-data-integrity, move it to s_more_io so that
+ * writeback-for-data-integrity, move it to b_more_io so that
* writeback can proceed with the other inodes on s_io.
*
* We'll have another go at writing back this inode when we
- * completed a full scan of s_io.
+ * completed a full scan of b_io.
*/
if (!wait) {
requeue_io(inode);
@@ -371,11 +444,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything. Redirty
- * the inode; Move it from s_io onto s_more_io/s_dirty.
+ * the inode; Move it from b_io onto b_more_io/b_dirty.
*/
/*
* akpm: if the caller was the kupdate function we put
- * this inode at the head of s_dirty so it gets first
+ * this inode at the head of b_dirty so it gets first
* consideration. Otherwise, move it to the tail, for
* the reasons described there. I'm not really sure
* how much sense this makes. Presumably I had a good
@@ -385,7 +458,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->for_kupdate) {
/*
* For the kupdate function we move the inode
- * to s_more_io so it will get more writeout as
+ * to b_more_io so it will get more writeout as
* soon as the queue becomes uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +507,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * Write out a superblock's list of dirty inodes. A wait will be performed
- * upon no inodes, all inodes or the final one, depending upon sync_mode.
- *
- * If older_than_this is non-NULL, then only write out inodes which
- * had their first dirtying at a time earlier than *older_than_this.
- *
- * If we're a pdflush thread, then implement pdflush collision avoidance
- * against the entire list.
+ * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
+ * before calling writeback. So make sure that we do pin it, so it doesn't
+ * go away while we are writing inodes from it.
*
- * If `bdi' is non-zero then we're being asked to writeback a specific queue.
- * This function assumes that the blockdev superblock's inodes are backed by
- * a variety of queues, so all inodes are searched. For other superblocks,
- * assume that all inodes are backed by the same queue.
- *
- * FIXME: this linear search could get expensive with many fileystems. But
- * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space. (Easy: have a global "dirty superblocks"
- * list).
- *
- * The inodes to be written are parked on sb->s_io. They are moved back onto
- * sb->s_dirty as they are selected for writing. This way, none can be missed
- * on the writer throttling path, and we get decent balancing between many
- * throttled threads: we don't want them all piling up on inode_sync_wait.
+ * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
+ * 1 if we failed.
*/
-void generic_sync_sb_inodes(struct super_block *sb,
+static int pin_sb_for_writeback(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * Caller must already hold the ref for this
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL) {
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+ return 0;
+ }
+
+ spin_lock(&sb_lock);
+ sb->s_count++;
+ if (down_read_trylock(&sb->s_umount)) {
+ if (sb->s_root) {
+ spin_unlock(&sb_lock);
+ return 0;
+ }
+ /*
+ * umounted, drop rwsem again and fall through to failure
+ */
+ up_read(&sb->s_umount);
+ }
+
+ sb->s_count--;
+ spin_unlock(&sb_lock);
+ return 1;
+}
+
+static void unpin_sb_for_writeback(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ return;
+
+ up_read(&sb->s_umount);
+ put_super(sb);
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc)
{
+ struct super_block *sb = wbc->sb;
+ const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
- int sync = wbc->sync_mode == WB_SYNC_ALL;
spin_lock(&inode_lock);
- if (!wbc->for_kupdate || list_empty(&sb->s_io))
- queue_io(sb, wbc->older_than_this);
- while (!list_empty(&sb->s_io)) {
- struct inode *inode = list_entry(sb->s_io.prev,
+ if (!wbc->for_kupdate || list_empty(&wb->b_io))
+ queue_io(wb, wbc->older_than_this);
+
+ while (!list_empty(&wb->b_io)) {
+ struct inode *inode = list_entry(wb->b_io.prev,
struct inode, i_list);
- struct address_space *mapping = inode->i_mapping;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;
- if (!bdi_cap_writeback_dirty(bdi)) {
+ /*
+ * super block given and doesn't match, skip this inode
+ */
+ if (sb && sb != inode->i_sb) {
+ redirty_tail(inode);
+ continue;
+ }
+
+ if (!bdi_cap_writeback_dirty(wb->bdi)) {
redirty_tail(inode);
- if (sb_is_blkdev_sb(sb)) {
+ if (is_blkdev_sb) {
/*
* Dirty memory-backed blockdev: the ramdisk
* driver does this. Skip just this inode
@@ -497,21 +604,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
continue;
}
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
wbc->encountered_congestion = 1;
- if (!sb_is_blkdev_sb(sb))
+ if (!is_blkdev_sb)
break; /* Skip a congested fs */
requeue_io(inode);
continue; /* Skip a congested blockdev */
}
- if (wbc->bdi && bdi != wbc->bdi) {
- if (!sb_is_blkdev_sb(sb))
- break; /* fs has the wrong queue */
- requeue_io(inode);
- continue; /* blockdev has wrong queue */
- }
-
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@@ -519,16 +619,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
if (inode_dirtied_after(inode, start))
break;
- /* Is another pdflush already flushing this queue? */
- if (current_is_pdflush() && !writeback_acquire(bdi))
- break;
+ if (pin_sb_for_writeback(wbc, inode)) {
+ requeue_io(inode);
+ continue;
+ }
BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
- if (current_is_pdflush())
- writeback_release(bdi);
+ unpin_sb_for_writeback(wbc, inode);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
@@ -544,144 +644,520 @@ void generic_sync_sb_inodes(struct super_block *sb,
wbc->more_io = 1;
break;
}
- if (!list_empty(&sb->s_more_io))
+ if (!list_empty(&wb->b_more_io))
wbc->more_io = 1;
}
- if (sync) {
- struct inode *inode, *old_inode = NULL;
+ spin_unlock(&inode_lock);
+ /* Leave any unwritten inodes on b_io */
+}
+
+void writeback_inodes_wbc(struct writeback_control *wbc)
+{
+ struct backing_dev_info *bdi = wbc->bdi;
+ writeback_inodes_wb(&bdi->wb, wbc);
+}
+
+/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES 1024
+
+static inline bool over_bground_thresh(void)
+{
+ unsigned long background_thresh, dirty_thresh;
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+
+ return (global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
+}
+
+/*
+ * Explicit flushing or periodic writeback of "old" data.
+ *
+ * Define "old": the first time one of an inode's pages is dirtied, we mark the
+ * dirtying-time in the inode's address_space. So this periodic writeback code
+ * just walks the superblock inode list, writing back any inodes which are
+ * older than a specific point in time.
+ *
+ * Try to run once per dirty_writeback_interval. But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
+ * one-second gap.
+ *
+ * older_than_this takes precedence over nr_to_write. So we'll only write back
+ * all dirty pages if they are all attached to "old" mappings.
+ */
+static long wb_writeback(struct bdi_writeback *wb,
+ struct wb_writeback_args *args)
+{
+ struct writeback_control wbc = {
+ .bdi = wb->bdi,
+ .sb = args->sb,
+ .sync_mode = args->sync_mode,
+ .older_than_this = NULL,
+ .for_kupdate = args->for_kupdate,
+ .range_cyclic = args->range_cyclic,
+ };
+ unsigned long oldest_jif;
+ long wrote = 0;
+
+ if (wbc.for_kupdate) {
+ wbc.older_than_this = &oldest_jif;
+ oldest_jif = jiffies -
+ msecs_to_jiffies(dirty_expire_interval * 10);
+ }
+ if (!wbc.range_cyclic) {
+ wbc.range_start = 0;
+ wbc.range_end = LLONG_MAX;
+ }
+
+ for (;;) {
/*
- * Data integrity sync. Must wait for all pages under writeback,
- * because there may have been pages dirtied before our sync
- * call, but which had writeout started before we write it out.
- * In which case, the inode may not be on the dirty list, but
- * we still have to wait for that writeout.
+ * Don't flush anything for non-integrity writeback where
+ * no nr_pages was given
*/
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- struct address_space *mapping;
+ if (!args->for_kupdate && args->nr_pages <= 0 &&
+ args->sync_mode == WB_SYNC_NONE)
+ break;
- if (inode->i_state &
- (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
- continue;
- mapping = inode->i_mapping;
- if (mapping->nrpages == 0)
+ /*
+ * If no specific pages were given and this is just a
+ * periodic background writeout and we are below the
+ * background dirty threshold, don't do anything
+ */
+ if (args->for_kupdate && args->nr_pages <= 0 &&
+ !over_bground_thresh())
+ break;
+
+ wbc.more_io = 0;
+ wbc.encountered_congestion = 0;
+ wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ wbc.pages_skipped = 0;
+ writeback_inodes_wb(wb, &wbc);
+ args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+
+ /*
+ * If we ran out of stuff to write, bail unless more_io got set
+ */
+ if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
+ if (wbc.more_io && !wbc.for_kupdate)
continue;
- __iget(inode);
- spin_unlock(&inode_lock);
- /*
- * We hold a reference to 'inode' so it couldn't have
- * been removed from s_inodes list while we dropped the
- * inode_lock. We cannot iput the inode now as we can
- * be holding the last reference and we cannot iput it
- * under inode_lock. So we keep the reference and iput
- * it later.
- */
- iput(old_inode);
- old_inode = inode;
+ break;
+ }
+ }
+
+ return wrote;
+}
+
+/*
+ * Return the next bdi_work struct that hasn't been processed by this
+ * wb thread yet. ->seen is initially set for each thread that exists
+ * for this device, when a thread first notices a piece of work it
+ * clears its bit. Depending on writeback type, the thread will notify
+ * completion on either receiving the work (WB_SYNC_NONE) or after
+ * it is done (WB_SYNC_ALL).
+ */
+static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
+ struct bdi_writeback *wb)
+{
+ struct bdi_work *work, *ret = NULL;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(work, &bdi->work_list, list) {
+ if (!test_bit(wb->nr, &work->seen))
+ continue;
+ clear_bit(wb->nr, &work->seen);
+
+ ret = work;
+ break;
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+static long wb_check_old_data_flush(struct bdi_writeback *wb)
+{
+ unsigned long expired;
+ long nr_pages;
+
+ expired = wb->last_old_flush +
+ msecs_to_jiffies(dirty_writeback_interval * 10);
+ if (time_before(jiffies, expired))
+ return 0;
+
+ wb->last_old_flush = jiffies;
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+ if (nr_pages) {
+ struct wb_writeback_args args = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .for_kupdate = 1,
+ .range_cyclic = 1,
+ };
+
+ return wb_writeback(wb, &args);
+ }
+
+ return 0;
+}
+
+/*
+ * Retrieve work items and do the writeback they describe
+ */
+long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
+{
+ struct backing_dev_info *bdi = wb->bdi;
+ struct bdi_work *work;
+ long wrote = 0;
- filemap_fdatawait(mapping);
+ while ((work = get_next_work_item(bdi, wb)) != NULL) {
+ struct wb_writeback_args args = work->args;
- cond_resched();
+ /*
+ * Override sync mode, in case we must wait for completion
+ */
+ if (force_wait)
+ work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
- spin_lock(&inode_lock);
+ /*
+ * If this isn't a data integrity operation, just notify
+ * that we have seen this work and we are now starting it.
+ */
+ if (args.sync_mode == WB_SYNC_NONE)
+ wb_clear_pending(wb, work);
+
+ wrote += wb_writeback(wb, &args);
+
+ /*
+ * This is a data integrity writeback, so only do the
+ * notification when we have completed the work.
+ */
+ if (args.sync_mode == WB_SYNC_ALL)
+ wb_clear_pending(wb, work);
+ }
+
+ /*
+ * Check for periodic writeback, kupdated() style
+ */
+ wrote += wb_check_old_data_flush(wb);
+
+ return wrote;
+}
+
+/*
+ * Handle writeback of dirty data for the device backed by this bdi. Also
+ * wakes up periodically and does kupdated style flushing.
+ */
+int bdi_writeback_task(struct bdi_writeback *wb)
+{
+ unsigned long last_active = jiffies;
+ unsigned long wait_jiffies = -1UL;
+ long pages_written;
+
+ while (!kthread_should_stop()) {
+ pages_written = wb_do_writeback(wb, 0);
+
+ if (pages_written)
+ last_active = jiffies;
+ else if (wait_jiffies != -1UL) {
+ unsigned long max_idle;
+
+ /*
+ * Longest period of inactivity that we tolerate. If we
+ * see dirty data again later, the task will get
+ * recreated automatically.
+ */
+ max_idle = max(5UL * 60 * HZ, wait_jiffies);
+ if (time_after(jiffies, max_idle + last_active))
+ break;
}
- spin_unlock(&inode_lock);
- iput(old_inode);
- } else
- spin_unlock(&inode_lock);
- return; /* Leave any unwritten inodes on s_io */
+ wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout_interruptible(wait_jiffies);
+ try_to_freeze();
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
-static void sync_sb_inodes(struct super_block *sb,
- struct writeback_control *wbc)
+/*
+ * Schedule writeback for all backing devices. This does WB_SYNC_NONE
+ * writeback, for integrity writeback see bdi_sync_writeback().
+ */
+static void bdi_writeback_all(struct super_block *sb, long nr_pages)
{
- generic_sync_sb_inodes(sb, wbc);
+ struct wb_writeback_args args = {
+ .sb = sb,
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ };
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ if (!bdi_has_dirty_io(bdi))
+ continue;
+
+ bdi_alloc_queue_work(bdi, &args);
+ }
+
+ rcu_read_unlock();
}
/*
- * Start writeback of dirty pagecache data against all unlocked inodes.
+ * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
+ * the whole world.
+ */
+void wakeup_flusher_threads(long nr_pages)
+{
+ if (nr_pages == 0)
+ nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ bdi_writeback_all(NULL, nr_pages);
+}
+
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+ if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+ struct dentry *dentry;
+ const char *name = "?";
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ spin_lock(&dentry->d_lock);
+ name = (const char *) dentry->d_name.name;
+ }
+ printk(KERN_DEBUG
+ "%s(%d): dirtied inode %lu (%s) on %s\n",
+ current->comm, task_pid_nr(current), inode->i_ino,
+ name, inode->i_sb->s_id);
+ if (dentry) {
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+ }
+ }
+}
+
+/**
+ * __mark_inode_dirty - internal function
+ * @inode: inode to mark
+ * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
+ * Mark an inode as dirty. Callers should use mark_inode_dirty or
+ * mark_inode_dirty_sync.
+ *
+ * Put the inode on the super block's dirty list.
*
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
+ * CAREFUL! We mark it dirty unconditionally, but move it onto the
+ * dirty list only if it is hashed or if it refers to a blockdev.
+ * If it was not hashed, it will never be added to the dirty list
+ * even if it is later hashed, as it will have been marked dirty already.
*
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
+ * In short, make sure you hash any inodes _before_ you start marking
+ * them dirty.
*
- * If `bdi' is non-zero then we will scan the first inode against each
- * superblock until we find the matching ones. One group will be the dirty
- * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
- * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not
- * super-efficient but we're about to do a ton of I/O...
+ * This function *must* be atomic for the I_DIRTY_PAGES case -
+ * set_page_dirty() is called under spinlock in several places.
+ *
+ * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
+ * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
+ * the kernel-internal blockdev inode represents the dirtying time of the
+ * blockdev's pages. This is why for I_DIRTY_PAGES we always use
+ * page->mapping->host, so the page-dirtying time is recorded in the internal
+ * blockdev inode.
*/
-void
-writeback_inodes(struct writeback_control *wbc)
+void __mark_inode_dirty(struct inode *inode, int flags)
{
- struct super_block *sb;
+ struct super_block *sb = inode->i_sb;
- might_sleep();
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry_reverse(sb, &super_blocks, s_list) {
- if (sb_has_dirty_inodes(sb)) {
- /* we're making our own get_super here */
- sb->s_count++;
- spin_unlock(&sb_lock);
- /*
- * If we can't get the readlock, there's no sense in
- * waiting around, most of the time the FS is going to
- * be unmounted by the time it is released.
- */
- if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root)
- sync_sb_inodes(sb, wbc);
- up_read(&sb->s_umount);
+ /*
+ * Don't do this for I_DIRTY_PAGES - that doesn't actually
+ * dirty the inode itself
+ */
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (sb->s_op->dirty_inode)
+ sb->s_op->dirty_inode(inode);
+ }
+
+ /*
+ * make sure that changes are seen by all cpus before we test i_state
+ * -- mikulas
+ */
+ smp_mb();
+
+ /* avoid the locking if we can */
+ if ((inode->i_state & flags) == flags)
+ return;
+
+ if (unlikely(block_dump))
+ block_dump___mark_inode_dirty(inode);
+
+ spin_lock(&inode_lock);
+ if ((inode->i_state & flags) != flags) {
+ const int was_dirty = inode->i_state & I_DIRTY;
+
+ inode->i_state |= flags;
+
+ /*
+ * If the inode is being synced, just update its dirty state.
+ * The unlocker will place the inode on the appropriate
+ * superblock list, based upon its state.
+ */
+ if (inode->i_state & I_SYNC)
+ goto out;
+
+ /*
+ * Only add valid (hashed) inodes to the superblock's
+ * dirty list. Add blockdev inodes as well.
+ */
+ if (!S_ISBLK(inode->i_mode)) {
+ if (hlist_unhashed(&inode->i_hash))
+ goto out;
+ }
+ if (inode->i_state & (I_FREEING|I_CLEAR))
+ goto out;
+
+ /*
+ * If the inode was already on b_dirty/b_io/b_more_io, don't
+ * reposition it (that would break b_dirty time-ordering).
+ */
+ if (!was_dirty) {
+ struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+ struct backing_dev_info *bdi = wb->bdi;
+
+ if (bdi_cap_writeback_dirty(bdi) &&
+ !test_bit(BDI_registered, &bdi->state)) {
+ WARN_ON(1);
+ printk(KERN_ERR "bdi-%s not registered\n",
+ bdi->name);
}
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
+
+ inode->dirtied_when = jiffies;
+ list_move(&inode->i_list, &wb->b_dirty);
}
- if (wbc->nr_to_write <= 0)
- break;
}
- spin_unlock(&sb_lock);
+out:
+ spin_unlock(&inode_lock);
}
+EXPORT_SYMBOL(__mark_inode_dirty);
/*
- * writeback and wait upon the filesystem's dirty inodes. The caller will
- * do this in two passes - one to write, and one to wait.
+ * Write out a superblock's list of dirty inodes. A wait will be performed
+ * upon no inodes, all inodes or the final one, depending upon sync_mode.
+ *
+ * If older_than_this is non-NULL, then only write out inodes which
+ * had their first dirtying at a time earlier than *older_than_this.
+ *
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
*
- * A finite limit is set on the number of pages which will be written.
- * To prevent infinite livelock of sys_sync().
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+ * assume that all inodes are backed by the same queue.
*
- * We add in the number of potentially dirty inodes, because each inode write
- * can dirty pagecache in the underlying blockdev.
+ * The inodes to be written are parked on bdi->b_io. They are moved back onto
+ * bdi->b_dirty as they are selected for writing. This way, none can be missed
+ * on the writer throttling path, and we get decent balancing between many
+ * throttled threads: we don't want them all piling up on inode_sync_wait.
*/
-void sync_inodes_sb(struct super_block *sb, int wait)
+static void wait_sb_inodes(struct super_block *sb)
{
- struct writeback_control wbc = {
- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ struct inode *inode, *old_inode = NULL;
+
+ /*
+ * We need to be protected against the filesystem going from
+ * r/o to r/w or vice versa.
+ */
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+ spin_lock(&inode_lock);
+
+ /*
+ * Data integrity sync. Must wait for all pages under writeback,
+ * because there may have been pages dirtied before our sync
+ * call, but which had writeout started before we write it out.
+ * In which case, the inode may not be on the dirty list, but
+ * we still have to wait for that writeout.
+ */
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+ struct address_space *mapping;
+
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+ continue;
+ mapping = inode->i_mapping;
+ if (mapping->nrpages == 0)
+ continue;
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have
+ * been removed from s_inodes list while we dropped the
+ * inode_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it
+ * under inode_lock. So we keep the reference and iput
+ * it later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+
+ filemap_fdatawait(mapping);
+
+ cond_resched();
- if (!wait) {
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ spin_lock(&inode_lock);
+ }
+ spin_unlock(&inode_lock);
+ iput(old_inode);
+}
- wbc.nr_to_write = nr_dirty + nr_unstable +
+/**
+ * writeback_inodes_sb - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO. The number of pages submitted is
+ * returned.
+ */
+void writeback_inodes_sb(struct super_block *sb)
+{
+ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ long nr_to_write;
+
+ nr_to_write = nr_dirty + nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
- } else
- wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
- sync_sb_inodes(sb, &wbc);
+ bdi_writeback_all(sb, nr_to_write);
+}
+EXPORT_SYMBOL(writeback_inodes_sb);
+
+/**
+ * sync_inodes_sb - sync sb inode pages
+ * @sb: the superblock
+ *
+ * This function writes and waits on any dirty inode belonging to this
+ * super_block. The number of pages synced is returned.
+ */
+void sync_inodes_sb(struct super_block *sb)
+{
+ bdi_sync_writeback(sb->s_bdi, sb);
+ wait_sb_inodes(sb);
}
+EXPORT_SYMBOL(sync_inodes_sb);
/**
* write_inode_now - write an inode to disk
@@ -737,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
return ret;
}
EXPORT_SYMBOL(sync_inode);
-
-/**
- * generic_osync_inode - flush all dirty data for a given inode to disk
- * @inode: inode to write
- * @mapping: the address_space that should be flushed
- * @what: what to write and wait upon
- *
- * This can be called by file_write functions for files which have the
- * O_SYNC flag set, to flush dirty writes to disk.
- *
- * @what is a bitmask, specifying which part of the inode's data should be
- * written and waited upon.
- *
- * OSYNC_DATA: i_mapping's dirty data
- * OSYNC_METADATA: the buffers at i_mapping->private_list
- * OSYNC_INODE: the inode itself
- */
-
-int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
-{
- int err = 0;
- int need_write_inode_now = 0;
- int err2;
-
- if (what & OSYNC_DATA)
- err = filemap_fdatawrite(mapping);
- if (what & (OSYNC_METADATA|OSYNC_DATA)) {
- err2 = sync_mapping_buffers(mapping);
- if (!err)
- err = err2;
- }
- if (what & OSYNC_DATA) {
- err2 = filemap_fdatawait(mapping);
- if (!err)
- err = err2;
- }
-
- spin_lock(&inode_lock);
- if ((inode->i_state & I_DIRTY) &&
- ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
- need_write_inode_now = 1;
- spin_unlock(&inode_lock);
-
- if (need_write_inode_now) {
- err2 = write_inode_now(inode, 1);
- if (!err)
- err = err2;
- }
- else
- inode_sync_wait(inode);
-
- return err;
-}
-EXPORT_SYMBOL(generic_osync_inode);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
return simple_read_from_buffer(buf, len, ppos, tmp, size);
}
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos, unsigned val)
+{
+ char tmp[32];
+ size_t size = sprintf(tmp, "%u\n", val);
+
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, unsigned *val,
+ unsigned global_limit)
+{
+ unsigned long t;
+ char tmp[32];
+ unsigned limit = (1 << 16) - 1;
+ int err;
+
+ if (*ppos || count >= sizeof(tmp) - 1)
+ return -EINVAL;
+
+ if (copy_from_user(tmp, buf, count))
+ return -EINVAL;
+
+ tmp[count] = '\0';
+
+ err = strict_strtoul(tmp, 0, &t);
+ if (err)
+ return err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ limit = min(limit, global_limit);
+
+ if (t > limit)
+ return -EINVAL;
+
+ *val = t;
+
+ return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->max_background;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_bgreq);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->max_background = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->congestion_threshold;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned val;
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_congthresh);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->congestion_threshold = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
static const struct file_operations fuse_ctl_abort_ops = {
.open = nonseekable_open,
.write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
.read = fuse_conn_waiting_read,
};
+static const struct file_operations fuse_conn_max_background_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_max_background_read,
+ .write = fuse_conn_max_background_write,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_congestion_threshold_read,
+ .write = fuse_conn_congestion_threshold_write,
+};
+
static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct fuse_conn *fc,
const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
goto err;
if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
- NULL, &fuse_ctl_waiting_ops) ||
+ NULL, &fuse_ctl_waiting_ops) ||
!fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
- NULL, &fuse_ctl_abort_ops))
+ NULL, &fuse_ctl_abort_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+ 1, NULL, &fuse_conn_max_background_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+ S_IFREG | 0600, 1, NULL,
+ &fuse_conn_congestion_threshold_ops))
goto err;
return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
d_drop(dentry);
dput(dentry);
}
- fuse_control_sb->s_root->d_inode->i_nlink--;
+ drop_nlink(fuse_control_sb->s_root->d_inode);
}
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd6..51d9e33d634f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
static void flush_bg_queue(struct fuse_conn *fc)
{
- while (fc->active_background < FUSE_MAX_BACKGROUND &&
+ while (fc->active_background < fc->max_background &&
!list_empty(&fc->bg_queue)) {
struct fuse_req *req;
@@ -280,11 +280,11 @@ __releases(&fc->lock)
list_del(&req->intr_entry);
req->state = FUSE_REQ_FINISHED;
if (req->background) {
- if (fc->num_background == FUSE_MAX_BACKGROUND) {
+ if (fc->num_background == fc->max_background) {
fc->blocked = 0;
wake_up_all(&fc->blocked_waitq);
}
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->connected && fc->bdi_initialized) {
clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
{
req->background = 1;
fc->num_background++;
- if (fc->num_background == FUSE_MAX_BACKGROUND)
+ if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+ if (fc->num_background == fc->congestion_threshold &&
fc->bdi_initialized) {
set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..fc9c79feb5f7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
-/** Maximum number of outstanding background requests */
-#define FUSE_MAX_BACKGROUND 12
-
-/** Congestion starts at 75% of maximum */
-#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
-
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
@@ -38,7 +32,7 @@
#define FUSE_NAME_MAX 1024
/** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 3
+#define FUSE_CTL_NUM_DENTRIES 5
/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
/** Global mutex protecting fuse_conn_list and the control filesystem */
extern struct mutex fuse_mutex;
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
/** FUSE inode */
struct fuse_inode {
/** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
/** rbtree of fuse_files waiting for poll events indexed by ph */
struct rb_root polled_files;
+ /** Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /** Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
/** Number of requests currently in the background */
unsigned num_background;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..6da947daabda 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/parser.h>
#include <linux/statfs.h>
#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
struct list_head fuse_conn_list;
DEFINE_MUTEX(fuse_mutex);
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+ &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+ &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
#define FUSE_SUPER_MAGIC 0x65735546
#define FUSE_DEFAULT_BLKSIZE 512
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
struct fuse_mount_data {
int fd;
unsigned rootmode;
@@ -517,6 +542,8 @@ void fuse_conn_init(struct fuse_conn *fc)
INIT_LIST_HEAD(&fc->bg_queue);
INIT_LIST_HEAD(&fc->entry);
atomic_set(&fc->num_waiting, 0);
+ fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
fc->reqctr = 0;
@@ -727,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
.show_options = fuse_show_options,
};
+static void sanitize_global_limit(unsigned *limit)
+{
+ if (*limit == 0)
+ *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
+ sizeof(struct fuse_req);
+
+ if (*limit >= 1 << 16)
+ *limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+ int rv;
+
+ rv = param_set_uint(val, kp);
+ if (rv)
+ return rv;
+
+ sanitize_global_limit((unsigned *)kp->arg);
+
+ return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+ int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+ if (arg->minor < 13)
+ return;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ if (arg->max_background) {
+ fc->max_background = arg->max_background;
+
+ if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+ fc->max_background = max_user_bgreq;
+ }
+ if (arg->congestion_threshold) {
+ fc->congestion_threshold = arg->congestion_threshold;
+
+ if (!cap_sys_admin &&
+ fc->congestion_threshold > max_user_congthresh)
+ fc->congestion_threshold = max_user_congthresh;
+ }
+}
+
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_init_out *arg = &req->misc.init_out;
@@ -736,6 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
else {
unsigned long ra_pages;
+ process_init_limits(fc, arg);
+
if (arg->minor >= 6) {
ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
if (arg->flags & FUSE_ASYNC_READ)
@@ -801,6 +878,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ fc->bdi.name = "fuse";
fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc->bdi.unplug_io_fn = default_unplug_io_fn;
/* fuse does it's own writeback accounting */
@@ -893,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_put_conn;
+ sb->s_bdi = &fc->bdi;
+
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1147,6 +1227,9 @@ static int __init fuse_init(void)
if (res)
goto err_sysfs_cleanup;
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
return 0;
err_sysfs_cleanup:
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
EXTRA_CFLAGS := -I$(src)
obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
glops.o inode.o log.o lops.o main.o meta_io.o \
aops.o dentry.o export.o file.o \
ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..3fc4e3ac7d84 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,8 +19,7 @@
#include "gfs2.h"
#include "incore.h"
#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -31,8 +30,7 @@
#define ACL_DEFAULT 0
int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
- struct gfs2_ea_request *er,
- int *remove, mode_t *mode)
+ struct gfs2_ea_request *er, int *remove, mode_t *mode)
{
struct posix_acl *acl;
int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
return 0;
}
-static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
- struct gfs2_ea_location *el, char **data, unsigned int *len)
+static int acl_get(struct gfs2_inode *ip, const char *name,
+ struct posix_acl **acl, struct gfs2_ea_location *el,
+ char **datap, unsigned int *lenp)
{
- struct gfs2_ea_request er;
- struct gfs2_ea_location el_this;
+ char *data;
+ unsigned int len;
int error;
+ el->el_bh = NULL;
+
if (!ip->i_eattr)
return 0;
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- if (access) {
- er.er_name = GFS2_POSIX_ACL_ACCESS;
- er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
- } else {
- er.er_name = GFS2_POSIX_ACL_DEFAULT;
- er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
- }
- er.er_type = GFS2_EATYPE_SYS;
-
- if (!el)
- el = &el_this;
-
- error = gfs2_ea_find(ip, &er, el);
+ error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
if (error)
return error;
if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
if (!GFS2_EA_DATA_LEN(el->el_ea))
goto out;
- er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
- er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
+ len = GFS2_EA_DATA_LEN(el->el_ea);
+ data = kmalloc(len, GFP_NOFS);
error = -ENOMEM;
- if (!er.er_data)
+ if (!data)
goto out;
- error = gfs2_ea_get_copy(ip, el, er.er_data);
- if (error)
+ error = gfs2_ea_get_copy(ip, el, data, len);
+ if (error < 0)
goto out_kfree;
+ error = 0;
if (acl) {
- *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+ *acl = posix_acl_from_xattr(data, len);
if (IS_ERR(*acl))
error = PTR_ERR(*acl);
}
out_kfree:
- if (error || !data)
- kfree(er.er_data);
- else {
- *data = er.er_data;
- *len = er.er_data_len;
+ if (error || !datap) {
+ kfree(data);
+ } else {
+ *datap = data;
+ *lenp = len;
}
out:
- if (error || el == &el_this)
- brelse(el->el_bh);
return error;
}
@@ -153,10 +140,12 @@ out:
int gfs2_check_acl(struct inode *inode, int mask)
{
+ struct gfs2_ea_location el;
struct posix_acl *acl = NULL;
int error;
- error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+ error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
+ brelse(el.el_bh);
if (error)
return error;
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
{
+ struct gfs2_ea_location el;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct posix_acl *acl = NULL, *clone;
- struct gfs2_ea_request er;
mode_t mode = ip->i_inode.i_mode;
+ char *data = NULL;
+ unsigned int len;
int error;
if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
if (S_ISLNK(ip->i_inode.i_mode))
return 0;
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = GFS2_EATYPE_SYS;
-
- error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
- &er.er_data, &er.er_data_len);
+ error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
+ brelse(el.el_bh);
if (error)
return error;
if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
acl = clone;
if (S_ISDIR(ip->i_inode.i_mode)) {
- er.er_name = GFS2_POSIX_ACL_DEFAULT;
- er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
- error = gfs2_system_eaops.eo_set(ip, &er);
+ error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+ GFS2_POSIX_ACL_DEFAULT, data, len, 0);
if (error)
goto out;
}
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
error = posix_acl_create_masq(acl, &mode);
if (error < 0)
goto out;
- if (error > 0) {
- er.er_name = GFS2_POSIX_ACL_ACCESS;
- er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
- posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
- er.er_mode = mode;
- er.er_flags = GFS2_ERF_MODE;
- error = gfs2_system_eaops.eo_set(ip, &er);
- if (error)
- goto out;
- } else
- munge_mode(ip, mode);
+ if (error == 0)
+ goto munge;
+ posix_acl_to_xattr(acl, data, len);
+ error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
+ GFS2_POSIX_ACL_ACCESS, data, len, 0);
+ if (error)
+ goto out;
+munge:
+ error = munge_mode(ip, mode);
out:
posix_acl_release(acl);
- kfree(er.er_data);
+ kfree(data);
return error;
}
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
unsigned int len;
int error;
- error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+ error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
if (error)
- return error;
+ goto out_brelse;
if (!acl)
return gfs2_setattr_simple(ip, attr);
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
out:
posix_acl_release(acl);
- brelse(el.el_bh);
kfree(data);
+out_brelse:
+ brelse(el.el_bh);
return error;
}
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
return 0;
}
+static int gfs2_dentry_delete(struct dentry *dentry)
+{
+ struct gfs2_inode *ginode;
+
+ if (!dentry->d_inode)
+ return 0;
+
+ ginode = GFS2_I(dentry->d_inode);
+ if (!ginode->i_iopen_gh.gh_gl)
+ return 0;
+
+ if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
+ return 1;
+
+ return 0;
+}
+
const struct dentry_operations gfs2_dops = {
.d_revalidate = gfs2_drevalidate,
.d_hash = gfs2_dhash,
+ .d_delete = gfs2_dentry_delete,
};
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/gfs2_ondisk.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
-#include "util.h"
-
-/**
- * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
- * @namep: ea name, possibly with type appended
- *
- * Returns: GFS2_EATYPE_XXX
- */
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
-{
- unsigned int type;
-
- if (strncmp(name, "system.", 7) == 0) {
- type = GFS2_EATYPE_SYS;
- if (truncated_name)
- *truncated_name = name + sizeof("system.") - 1;
- } else if (strncmp(name, "user.", 5) == 0) {
- type = GFS2_EATYPE_USR;
- if (truncated_name)
- *truncated_name = name + sizeof("user.") - 1;
- } else if (strncmp(name, "security.", 9) == 0) {
- type = GFS2_EATYPE_SECURITY;
- if (truncated_name)
- *truncated_name = name + sizeof("security.") - 1;
- } else {
- type = GFS2_EATYPE_UNUSED;
- if (truncated_name)
- *truncated_name = NULL;
- }
-
- return type;
-}
-
-static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
- !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
- (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
- GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
- return -EOPNOTSUPP;
-
- return gfs2_ea_get_i(ip, er);
-}
-
-static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- int remove = 0;
- int error;
-
- if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
- if (!(er->er_flags & GFS2_ERF_MODE)) {
- er->er_mode = ip->i_inode.i_mode;
- er->er_flags |= GFS2_ERF_MODE;
- }
- error = gfs2_acl_validate_set(ip, 1, er,
- &remove, &er->er_mode);
- if (error)
- return error;
- error = gfs2_ea_set_i(ip, er);
- if (error)
- return error;
- if (remove)
- gfs2_ea_remove_i(ip, er);
- return 0;
-
- } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
- error = gfs2_acl_validate_set(ip, 0, er,
- &remove, NULL);
- if (error)
- return error;
- if (!remove)
- error = gfs2_ea_set_i(ip, er);
- else {
- error = gfs2_ea_remove_i(ip, er);
- if (error == -ENODATA)
- error = 0;
- }
- return error;
- }
-
- return -EPERM;
-}
-
-static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
- int error = gfs2_acl_validate_remove(ip, 1);
- if (error)
- return error;
-
- } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
- int error = gfs2_acl_validate_remove(ip, 0);
- if (error)
- return error;
-
- } else
- return -EPERM;
-
- return gfs2_ea_remove_i(ip, er);
-}
-
-static const struct gfs2_eattr_operations gfs2_user_eaops = {
- .eo_get = gfs2_ea_get_i,
- .eo_set = gfs2_ea_set_i,
- .eo_remove = gfs2_ea_remove_i,
- .eo_name = "user",
-};
-
-const struct gfs2_eattr_operations gfs2_system_eaops = {
- .eo_get = system_eo_get,
- .eo_set = system_eo_set,
- .eo_remove = system_eo_remove,
- .eo_name = "system",
-};
-
-static const struct gfs2_eattr_operations gfs2_security_eaops = {
- .eo_get = gfs2_ea_get_i,
- .eo_set = gfs2_ea_set_i,
- .eo_remove = gfs2_ea_remove_i,
- .eo_name = "security",
-};
-
-const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
- NULL,
- &gfs2_user_eaops,
- &gfs2_system_eaops,
- &gfs2_security_eaops,
-};
-
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __EAOPS_DOT_H__
-#define __EAOPS_DOT_H__
-
-struct gfs2_ea_request;
-struct gfs2_inode;
-
-struct gfs2_eattr_operations {
- int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
- char *eo_name;
-};
-
-unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
-
-extern const struct gfs2_eattr_operations gfs2_system_eaops;
-
-extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
-
-#endif /* __EAOPS_DOT_H__ */
-
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
}
static struct dentry *gfs2_get_dentry(struct super_block *sb,
- struct gfs2_inum_host *inum)
+ struct gfs2_inum_host *inum)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- struct gfs2_holder i_gh, ri_gh, rgd_gh;
- struct gfs2_rgrpd *rgd;
+ struct gfs2_holder i_gh;
struct inode *inode;
struct dentry *dentry;
int error;
- /* System files? */
-
inode = gfs2_ilookup(sb, inum->no_addr);
if (inode) {
if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
if (error)
return ERR_PTR(error);
- error = gfs2_rindex_hold(sdp, &ri_gh);
+ error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
if (error)
goto fail;
- error = -EINVAL;
- rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
- if (!rgd)
- goto fail_rindex;
-
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
- if (error)
- goto fail_rindex;
-
- error = -ESTALE;
- if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
- goto fail_rgd;
-
- gfs2_glock_dq_uninit(&rgd_gh);
- gfs2_glock_dq_uninit(&ri_gh);
-
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
- inum->no_addr,
- 0, 0);
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
goto fail;
@@ -224,13 +203,6 @@ out_inode:
if (!IS_ERR(dentry))
dentry->d_op = &gfs2_dops;
return dentry;
-
-fail_rgd:
- gfs2_glock_dq_uninit(&rgd_gh);
-
-fail_rindex:
- gfs2_glock_dq_uninit(&ri_gh);
-
fail:
gfs2_glock_dq_uninit(&i_gh);
return ERR_PTR(error);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..166f38fbd246 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
#include "rgrp.h"
#include "trans.h"
#include "util.h"
-#include "eaops.h"
/**
* gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 61801ada36f0..6edb423f90b3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -406,6 +406,12 @@ struct gfs2_statfs_change_host {
#define GFS2_DATA_WRITEBACK 1
#define GFS2_DATA_ORDERED 2
+#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW
+#define GFS2_ERRORS_WITHDRAW 0
+#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */
+#define GFS2_ERRORS_RO 2 /* place holder for future feature */
+#define GFS2_ERRORS_PANIC 3
+
struct gfs2_args {
char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
@@ -422,6 +428,7 @@ struct gfs2_args {
unsigned int ar_data:2; /* ordered/writeback */
unsigned int ar_meta:1; /* mount metafs */
unsigned int ar_discard:1; /* discard requests */
+ unsigned int ar_errors:2; /* errors=withdraw | panic */
int ar_commit; /* Commit interval */
};
@@ -489,7 +496,6 @@ struct gfs2_sb_host {
*/
struct lm_lockstruct {
- u32 ls_id;
unsigned int ls_jid;
unsigned int ls_first;
unsigned int ls_first_done;
@@ -541,18 +547,12 @@ struct gfs2_sbd {
struct dentry *sd_root_dir;
struct inode *sd_jindex;
- struct inode *sd_inum_inode;
struct inode *sd_statfs_inode;
- struct inode *sd_ir_inode;
struct inode *sd_sc_inode;
struct inode *sd_qc_inode;
struct inode *sd_rindex;
struct inode *sd_quota_inode;
- /* Inum stuff */
-
- struct mutex sd_inum_mutex;
-
/* StatFS stuff */
spinlock_t sd_statfs_spin;
@@ -580,7 +580,6 @@ struct gfs2_sbd {
struct gfs2_holder sd_journal_gh;
struct gfs2_holder sd_jinode_gh;
- struct gfs2_holder sd_ir_gh;
struct gfs2_holder sd_sc_gh;
struct gfs2_holder sd_qc_gh;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..fb15d3b1f409 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
#include "acl.h"
#include "bmap.h"
#include "dir.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "glops.h"
#include "inode.h"
@@ -519,139 +519,6 @@ out:
return inode ? inode : ERR_PTR(error);
}
-static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
-{
- const struct gfs2_inum_range *str = buf;
-
- ir->ir_start = be64_to_cpu(str->ir_start);
- ir->ir_length = be64_to_cpu(str->ir_length);
-}
-
-static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
-{
- struct gfs2_inum_range *str = buf;
-
- str->ir_start = cpu_to_be64(ir->ir_start);
- str->ir_length = cpu_to_be64(ir->ir_length);
-}
-
-static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
- struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
- struct buffer_head *bh;
- struct gfs2_inum_range_host ir;
- int error;
-
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error)
- return error;
- mutex_lock(&sdp->sd_inum_mutex);
-
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error) {
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
- return error;
- }
-
- gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
- if (ir.ir_length) {
- *formal_ino = ir.ir_start++;
- ir.ir_length--;
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_inum_range_out(&ir,
- bh->b_data + sizeof(struct gfs2_dinode));
- brelse(bh);
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
- return 0;
- }
-
- brelse(bh);
-
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
-
- return 1;
-}
-
-static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
-{
- struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
- struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
- struct gfs2_holder gh;
- struct buffer_head *bh;
- struct gfs2_inum_range_host ir;
- int error;
-
- error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- if (error)
- return error;
-
- error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
- if (error)
- goto out;
- mutex_lock(&sdp->sd_inum_mutex);
-
- error = gfs2_meta_inode_buffer(ip, &bh);
- if (error)
- goto out_end_trans;
-
- gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
- if (!ir.ir_length) {
- struct buffer_head *m_bh;
- u64 x, y;
- __be64 z;
-
- error = gfs2_meta_inode_buffer(m_ip, &m_bh);
- if (error)
- goto out_brelse;
-
- z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
- x = y = be64_to_cpu(z);
- ir.ir_start = x;
- ir.ir_length = GFS2_INUM_QUANTUM;
- x += GFS2_INUM_QUANTUM;
- if (x < y)
- gfs2_consist_inode(m_ip);
- z = cpu_to_be64(x);
- gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
- *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
-
- brelse(m_bh);
- }
-
- *formal_ino = ir.ir_start++;
- ir.ir_length--;
-
- gfs2_trans_add_bh(ip->i_gl, bh, 1);
- gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
-
-out_brelse:
- brelse(bh);
-out_end_trans:
- mutex_unlock(&sdp->sd_inum_mutex);
- gfs2_trans_end(sdp);
-out:
- gfs2_glock_dq_uninit(&gh);
- return error;
-}
-
-static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
-{
- int error;
-
- error = pick_formal_ino_1(sdp, inum);
- if (error <= 0)
- return error;
-
- error = pick_formal_ino_2(sdp, inum);
-
- return error;
-}
-
/**
* create_ok - OK to create a new on-disk inode here?
* @dip: Directory in which dinode is to be created
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
if (error)
goto out_ipreserv;
- *no_addr = gfs2_alloc_di(dip, generation);
+ error = gfs2_alloc_di(dip, no_addr, generation);
gfs2_trans_end(sdp);
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
size_t len;
void *value;
char *name;
- struct gfs2_ea_request er;
err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
&name, &value, &len);
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
return err;
}
- memset(&er, 0, sizeof(struct gfs2_ea_request));
-
- er.er_type = GFS2_EATYPE_SECURITY;
- er.er_name = name;
- er.er_data = value;
- er.er_name_len = strlen(name);
- er.er_data_len = len;
-
- err = gfs2_ea_set_i(ip, &er);
-
+ err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
kfree(value);
kfree(name);
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock;
- error = pick_formal_ino(sdp, &inum.no_formal_ino);
- if (error)
- goto fail_gunlock;
-
error = alloc_dinode(dip, &inum.no_addr, &generation);
if (error)
goto fail_gunlock;
+ inum.no_formal_ino = generation;
error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock2;
- inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
- inum.no_addr,
- inum.no_formal_ino, 0);
+ inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
+ inum.no_formal_ino, 0);
if (IS_ERR(inode))
goto fail_gunlock2;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..52fb6c048981 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
gfs2_tune_init(&sdp->sd_tune);
- mutex_init(&sdp->sd_inum_mutex);
spin_lock_init(&sdp->sd_statfs_spin);
spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
if (error)
goto fail;
- /* Read in the master inode number inode */
- sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
- if (IS_ERR(sdp->sd_inum_inode)) {
- error = PTR_ERR(sdp->sd_inum_inode);
- fs_err(sdp, "can't read in inum inode: %d\n", error);
- goto fail_journal;
- }
-
-
/* Read in the master statfs inode */
sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
- goto fail_inum;
+ goto fail_journal;
}
/* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
iput(sdp->sd_rindex);
fail_statfs:
iput(sdp->sd_statfs_inode);
-fail_inum:
- iput(sdp->sd_inum_inode);
fail_journal:
init_journal(sdp, UNDO);
fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
return error;
}
- sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
- sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
- if (IS_ERR(sdp->sd_ir_inode)) {
- error = PTR_ERR(sdp->sd_ir_inode);
- fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
- goto fail;
- }
-
sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
if (IS_ERR(sdp->sd_sc_inode)) {
error = PTR_ERR(sdp->sd_sc_inode);
fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
- goto fail_ir_i;
+ goto fail;
}
sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
iput(pn);
pn = NULL;
- ip = GFS2_I(sdp->sd_ir_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
- &sdp->sd_ir_gh);
- if (error) {
- fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
- goto fail_qc_i;
- }
-
ip = GFS2_I(sdp->sd_sc_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_sc_gh);
if (error) {
fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
- goto fail_ir_gh;
+ goto fail_qc_i;
}
ip = GFS2_I(sdp->sd_qc_inode);
- error = gfs2_glock_nq_init(ip->i_gl,
- LM_ST_EXCLUSIVE, 0,
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
&sdp->sd_qc_gh);
if (error) {
fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
fail_ut_gh:
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-fail_ir_gh:
- gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
fail_qc_i:
iput(sdp->sd_qc_inode);
fail_ut_i:
iput(sdp->sd_sc_inode);
-fail_ir_i:
- iput(sdp->sd_ir_inode);
fail:
if (pn)
iput(pn);
@@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ls->ls_ops = lm;
ls->ls_first = 1;
- ls->ls_id = 0;
for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
substring_t tmp[MAX_OPT_ARGS];
@@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
ls->ls_jid = option;
break;
case Opt_id:
- ret = match_int(&tmp[0], &option);
- if (ret)
- goto hostdata_error;
- ls->ls_id = option;
+ /* Obsolete, but left for backward compat purposes */
break;
case Opt_first:
ret = match_int(&tmp[0], &option);
@@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
lm->lm_unmount(sdp);
}
+void gfs2_online_uevent(struct gfs2_sbd *sdp)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ char ro[20];
+ char spectator[20];
+ char *envp[] = { ro, spectator, NULL };
+ sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+ sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
+ kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
+}
+
/**
* fill_super - Read in superblock
* @sb: The VFS superblock
@@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
sdp->sd_args.ar_commit = 60;
+ sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
error = gfs2_mount_args(sdp, &sdp->sd_args, data);
if (error) {
@@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
sb->s_export_op = &gfs2_export_ops;
+ sb->s_xattr = gfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
}
gfs2_glock_dq_uninit(&mount_gh);
-
+ gfs2_online_uevent(sdp);
return 0;
fail_threads:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..c3ac18054057 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,8 +26,7 @@
#include "acl.h"
#include "bmap.h"
#include "dir.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
if (error)
- goto out_rgrp;
+ goto out_gunlock;
error = gfs2_dir_del(dip, &dentry->d_name);
if (error)
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
const void *data, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_data = (char *)data;
- er.er_name_len = strlen(er.er_name);
- er.er_data_len = size;
- er.er_flags = flags;
-
- gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_set(GFS2_I(inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_setxattr(dentry, name, data, size, flags);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
void *data, size_t size)
{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_data = data;
- er.er_name_len = strlen(er.er_name);
- er.er_data_len = size;
-
- return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
-}
-
-static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_data = (size) ? buffer : NULL;
- er.er_data_len = size;
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_getxattr(dentry, name, data, size);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static int gfs2_removexattr(struct dentry *dentry, const char *name)
{
- struct gfs2_ea_request er;
-
- memset(&er, 0, sizeof(struct gfs2_ea_request));
- er.er_type = gfs2_ea_name2type(name, &er.er_name);
- if (er.er_type == GFS2_EATYPE_UNUSED)
- return -EOPNOTSUPP;
- er.er_name_len = strlen(er.er_name);
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
- return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
+ if (ret == 0) {
+ ret = generic_removexattr(dentry, name);
+ gfs2_glock_dq(&gh);
+ }
+ gfs2_holder_uninit(&gh);
+ return ret;
}
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..8f1cfb02a6cb 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -179,7 +179,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
* always aligned to a 64 bit boundary.
*
* The size of the buffer is in bytes, but is it assumed that it is
- * always ok to to read a complete multiple of 64 bits at the end
+ * always ok to read a complete multiple of 64 bits at the end
* of the block in case the end is no aligned to a natural boundary.
*
* Return: the block number (bitmap buffer scope) that was found
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
goto start_new_extent;
if ((start + nr_sects) != blk) {
rv = blkdev_issue_discard(bdev, start,
- nr_sects, GFP_NOFS);
+ nr_sects, GFP_NOFS,
+ DISCARD_FL_BARRIER);
if (rv)
goto fail;
nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
}
}
if (nr_sects) {
- rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+ rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
+ DISCARD_FL_BARRIER);
if (rv)
goto fail;
}
@@ -1256,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
* Returns: The block type (GFS2_BLKST_*)
*/
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
{
struct gfs2_bitmap *bi = NULL;
u32 length, rgrp_block, buf_block;
@@ -1459,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
return 0;
}
+static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+ (unsigned long long)rgd->rd_addr);
+ fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+ gfs2_rgrp_dump(NULL, rgd->rd_gl);
+ rgd->rd_flags |= GFS2_RDF_ERROR;
+}
+
/**
* gfs2_alloc_block - Allocate one or more blocks
* @ip: the inode to allocate the block for
@@ -1520,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
return 0;
rgrp_error:
- fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
- (unsigned long long)rgd->rd_addr);
- fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
- gfs2_rgrp_dump(NULL, rgd->rd_gl);
- rgd->rd_flags |= GFS2_RDF_ERROR;
+ gfs2_rgrp_error(rgd);
return -EIO;
}
/**
* gfs2_alloc_di - Allocate a dinode
* @dip: the directory that the inode is going in
+ * @bn: the block number which is allocated
+ * @generation: the generation number of the inode
*
- * Returns: the block allocated
+ * Returns: 0 on success or error
*/
-u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_alloc *al = dip->i_alloc;
@@ -1546,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
blk = rgblk_search(rgd, rgd->rd_last_alloc,
GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
- BUG_ON(blk == BFITNOENT);
- rgd->rd_last_alloc = blk;
+ /* Since all blocks are reserved in advance, this shouldn't happen */
+ if (blk == BFITNOENT)
+ goto rgrp_error;
+ rgd->rd_last_alloc = blk;
block = rgd->rd_data0 + blk;
+ if (rgd->rd_free == 0)
+ goto rgrp_error;
- gfs2_assert_withdraw(sdp, rgd->rd_free);
rgd->rd_free--;
rgd->rd_dinodes++;
*generation = rgd->rd_igeneration++;
+ if (*generation == 0)
+ *generation = rgd->rd_igeneration++;
gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1568,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
rgd->rd_free_clone--;
spin_unlock(&sdp->sd_rindex_spin);
trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
- return block;
+ *bn = block;
+ return 0;
+
+rgrp_error:
+ gfs2_rgrp_error(rgd);
+ return -EIO;
}
/**
@@ -1676,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
}
/**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The block number to check
+ * @type: The block type we are looking for
+ *
+ * Returns: 0 if the block type matches the expected type
+ * -ESTALE if it doesn't match
+ * or -ve errno if something went wrong while checking
+ */
+
+int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
+{
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_holder ri_gh, rgd_gh;
+ int error;
+
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto fail;
+
+ error = -EINVAL;
+ rgd = gfs2_blk2rgrpd(sdp, no_addr);
+ if (!rgd)
+ goto fail_rindex;
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+ if (error)
+ goto fail_rindex;
+
+ if (gfs2_get_block_type(rgd, no_addr) != type)
+ error = -ESTALE;
+
+ gfs2_glock_dq_uninit(&rgd_gh);
+fail_rindex:
+ gfs2_glock_dq_uninit(&ri_gh);
+fail:
+ return error;
+}
+
+/**
* gfs2_rlist_add - add a RG to a list of RGs
* @sdp: the filesystem
* @rlist: the list of resource groups
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
extern void gfs2_inplace_release(struct gfs2_inode *ip);
-extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
extern void gfs2_unlink_di(struct inode *inode);
+extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
+ unsigned int type);
struct gfs2_rgrp_list {
unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f522bb017973..0ec3ec672de1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
#include "trans.h"
#include "util.h"
#include "sys.h"
-#include "eattr.h"
+#include "xattr.h"
#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
@@ -68,6 +68,8 @@ enum {
Opt_discard,
Opt_nodiscard,
Opt_commit,
+ Opt_err_withdraw,
+ Opt_err_panic,
Opt_error,
};
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_commit, "commit=%d"},
+ {Opt_err_withdraw, "errors=withdraw"},
+ {Opt_err_panic, "errors=panic"},
{Opt_error, NULL}
};
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
args->ar_localcaching = 1;
break;
case Opt_debug:
+ if (args->ar_errors == GFS2_ERRORS_PANIC) {
+ fs_info(sdp, "-o debug and -o errors=panic "
+ "are mutually exclusive.\n");
+ return -EINVAL;
+ }
args->ar_debug = 1;
break;
case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
return rv ? rv : -EINVAL;
}
break;
+ case Opt_err_withdraw:
+ args->ar_errors = GFS2_ERRORS_WITHDRAW;
+ break;
+ case Opt_err_panic:
+ if (args->ar_debug) {
+ fs_info(sdp, "-o debug and -o errors=panic "
+ "are mutually exclusive.\n");
+ return -EINVAL;
+ }
+ args->ar_errors = GFS2_ERRORS_PANIC;
+ break;
case Opt_error:
default:
fs_info(sdp, "invalid mount option: %s\n", o);
@@ -768,7 +788,6 @@ restart:
/* Release stuff */
iput(sdp->sd_jindex);
- iput(sdp->sd_inum_inode);
iput(sdp->sd_statfs_inode);
iput(sdp->sd_rindex);
iput(sdp->sd_quota_inode);
@@ -779,10 +798,8 @@ restart:
if (!sdp->sd_args.ar_spectator) {
gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
- gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
- iput(sdp->sd_ir_inode);
iput(sdp->sd_sc_inode);
iput(sdp->sd_qc_inode);
}
@@ -1084,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
gt->gt_log_flush_secs = args.ar_commit;
spin_unlock(&gt->gt_spin);
+ gfs2_online_uevent(sdp);
return 0;
}
@@ -1225,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
lfsecs = sdp->sd_tune.gt_log_flush_secs;
if (lfsecs != 60)
seq_printf(s, ",commit=%d", lfsecs);
+ if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
+ const char *state;
+
+ switch (args->ar_errors) {
+ case GFS2_ERRORS_WITHDRAW:
+ state = "withdraw";
+ break;
+ case GFS2_ERRORS_PANIC:
+ state = "panic";
+ break;
+ default:
+ state = "unknown";
+ break;
+ }
+ seq_printf(s, ",errors=%s", state);
+ }
return 0;
}
@@ -1252,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
goto out;
}
+ error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
+ if (error)
+ goto out_truncate;
+
gfs2_glock_dq_wait(&ip->i_iopen_gh);
gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 22e0417ed996..235db3682885 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
return x;
}
-void gfs2_jindex_free(struct gfs2_sbd *sdp);
+extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
@@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
struct gfs2_inode **ipp);
extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-
+extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
s64 dinodes);
@@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
extern const struct export_operations gfs2_export_ops;
extern const struct super_operations gfs2_super_ops;
extern const struct dentry_operations gfs2_dops;
+extern struct xattr_handler *gfs2_xattr_handlers[];
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 23419dc3027b..446329728d52 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <asm/uaccess.h>
#include <linux/gfs2_ondisk.h>
+#include <linux/genhd.h>
#include "gfs2.h"
#include "incore.h"
@@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
return ret;
}
-static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
-{
- struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- return sprintf(buf, "%u\n", ls->ls_id);
-}
-
static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -386,22 +381,20 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
#define GDLM_ATTR(_name,_mode,_show,_store) \
static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
-GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
-GDLM_ATTR(block, 0644, block_show, block_store);
-GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
-GDLM_ATTR(id, 0444, lkid_show, NULL);
-GDLM_ATTR(jid, 0444, jid_show, NULL);
-GDLM_ATTR(first, 0444, lkfirst_show, NULL);
-GDLM_ATTR(first_done, 0444, first_done_show, NULL);
-GDLM_ATTR(recover, 0200, NULL, recover_store);
-GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
+GDLM_ATTR(block, 0644, block_show, block_store);
+GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
+GDLM_ATTR(jid, 0444, jid_show, NULL);
+GDLM_ATTR(first, 0444, lkfirst_show, NULL);
+GDLM_ATTR(first_done, 0444, first_done_show, NULL);
+GDLM_ATTR(recover, 0600, NULL, recover_store);
+GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
static struct attribute *lock_module_attrs[] = {
&gdlm_attr_proto_name.attr,
&gdlm_attr_block.attr,
&gdlm_attr_withdraw.attr,
- &gdlm_attr_id.attr,
&gdlm_attr_jid.attr,
&gdlm_attr_first.attr,
&gdlm_attr_first_done.attr,
@@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = {
int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
{
+ struct super_block *sb = sdp->sd_vfs;
int error;
+ char ro[20];
+ char spectator[20];
+ char *envp[] = { ro, spectator, NULL };
+
+ sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
+ sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
sdp->sd_kobj.kset = gfs2_kset;
error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
if (error)
goto fail_tune;
- kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
+ error = sysfs_create_link(&sdp->sd_kobj,
+ &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
+ "device");
+ if (error)
+ goto fail_lock_module;
+
+ kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
return 0;
+fail_lock_module:
+ sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
fail_tune:
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
fail_reg:
@@ -549,12 +557,12 @@ fail:
void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
{
+ sysfs_remove_link(&sdp->sd_kobj, "device");
sysfs_remove_group(&sdp->sd_kobj, &tune_group);
sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
kobject_put(&sdp->sd_kobj);
}
-
static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
struct kobj_uevent_env *env)
{
@@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+ if (!sdp->sd_args.ar_spectator)
+ add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
if (gfs2_uuid_valid(uuid)) {
add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
"%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
.uevent = gfs2_uevent,
};
-
int gfs2_sys_init(void)
{
gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
const struct lm_lockops *lm = ls->ls_ops;
va_list args;
- if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+ test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return 0;
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
- fs_err(sdp, "about to withdraw this file system\n");
- BUG_ON(sdp->sd_args.ar_debug);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+ fs_err(sdp, "about to withdraw this file system\n");
+ BUG_ON(sdp->sd_args.ar_debug);
- kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+ kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
- if (lm->lm_unmount) {
- fs_err(sdp, "telling LM to unmount\n");
- lm->lm_unmount(sdp);
+ if (lm->lm_unmount) {
+ fs_err(sdp, "telling LM to unmount\n");
+ lm->lm_unmount(sdp);
+ }
+ fs_err(sdp, "withdrawn\n");
+ dump_stack();
}
- fs_err(sdp, "withdrawn\n");
- dump_stack();
+
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
return -1;
}
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
gfs2_tune_get(sdp, gt_complain_secs) * HZ))
return -2;
- printk(KERN_WARNING
- "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname, assertion,
- sdp->sd_fsname, function, file, line);
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+ printk(KERN_WARNING
+ "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
if (sdp->sd_args.ar_debug)
BUG();
else
dump_stack();
+ if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+ panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
+
sdp->sd_last_warning = jiffies;
return -1;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c
index 07ea9529adda..8a0f8ef6ee27 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/xattr.c
@@ -18,8 +18,7 @@
#include "gfs2.h"
#include "incore.h"
#include "acl.h"
-#include "eaops.h"
-#include "eattr.h"
+#include "xattr.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
@@ -38,26 +37,32 @@
* Returns: 1 if the EA should be stuffed
*/
-static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
unsigned int *size)
{
- *size = GFS2_EAREQ_SIZE_STUFFED(er);
- if (*size <= sdp->sd_jbsize)
+ unsigned int jbsize = sdp->sd_jbsize;
+
+ /* Stuffed */
+ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+ if (*size <= jbsize)
return 1;
- *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+ /* Unstuffed */
+ *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+ (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
return 0;
}
-static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
{
unsigned int size;
- if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+ if (dsize > GFS2_EA_MAX_DATA_LEN)
return -ERANGE;
- ea_calc_size(sdp, er, &size);
+ ea_calc_size(sdp, nsize, dsize, &size);
/* This can only happen with 512 byte blocks */
if (size > sdp->sd_jbsize)
@@ -151,7 +156,9 @@ out:
}
struct ea_find {
- struct gfs2_ea_request *ef_er;
+ int type;
+ const char *name;
+ size_t namel;
struct gfs2_ea_location *ef_el;
};
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
void *private)
{
struct ea_find *ef = private;
- struct gfs2_ea_request *er = ef->ef_er;
if (ea->ea_type == GFS2_EATYPE_UNUSED)
return 0;
- if (ea->ea_type == er->er_type) {
- if (ea->ea_name_len == er->er_name_len &&
- !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+ if (ea->ea_type == ef->type) {
+ if (ea->ea_name_len == ef->namel &&
+ !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
struct gfs2_ea_location *el = ef->ef_el;
get_bh(bh);
el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
return 0;
}
-int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
struct gfs2_ea_location *el)
{
struct ea_find ef;
int error;
- ef.ef_er = er;
+ ef.type = type;
+ ef.name = name;
+ ef.namel = strlen(name);
ef.ef_el = el;
memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
unsigned int ei_size;
};
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+ switch (ea->ea_type) {
+ case GFS2_EATYPE_USR:
+ return 5 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SYS:
+ return 7 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SECURITY:
+ return 9 + ea->ea_name_len + 1;
+ default:
+ return 0;
+ }
+}
+
static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
void *private)
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
}
/**
- * gfs2_ea_list -
- * @ip:
- * @er:
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
*
* Returns: actual size of data on success, -errno on error
*/
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_ea_request er;
struct gfs2_holder i_gh;
int error;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ if (size) {
+ er.er_data = buffer;
+ er.er_data_len = size;
}
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
return error;
if (ip->i_eattr) {
- struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+ struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
error = ea_foreach(ip, ea_list_i, &ei);
if (!error)
@@ -491,84 +517,61 @@ out:
}
int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- char *data)
+ char *data, size_t size)
{
+ int ret;
+ size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+ if (len > size)
+ return -ERANGE;
+
if (GFS2_EA_IS_STUFFED(el->el_ea)) {
- memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
- return 0;
- } else
- return ea_get_unstuffed(ip, el->el_ea, data);
+ memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+ return len;
+ }
+ ret = ea_get_unstuffed(ip, el->el_ea, data);
+ if (ret < 0)
+ return ret;
+ return len;
}
/**
- * gfs2_ea_get_i -
- * @ip: The GFS2 inode
- * @er: The request structure
+ * gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of extended attribute
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
*
* Returns: actual size of data on success, -errno on error
*/
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+ void *buffer, size_t size)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_ea_location el;
int error;
if (!ip->i_eattr)
return -ENODATA;
+ if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
- error = gfs2_ea_find(ip, er, &el);
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
if (!el.el_ea)
return -ENODATA;
-
- if (er->er_data_len) {
- if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
- error = -ERANGE;
- else
- error = gfs2_ea_get_copy(ip, &el, er->er_data);
- }
- if (!error)
+ if (size)
+ error = gfs2_ea_get_copy(ip, &el, buffer, size);
+ else
error = GFS2_EA_DATA_LEN(el.el_ea);
-
brelse(el.el_bh);
return error;
}
/**
- * gfs2_ea_get -
- * @ip: The GFS2 inode
- * @er: The request structure
- *
- * Returns: actual size of data on success, -errno on error
- */
-
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_holder i_gh;
- int error;
-
- if (!er->er_name_len ||
- er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
- }
-
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
- if (error)
- return error;
-
- error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
-
- gfs2_glock_dq_uninit(&i_gh);
-
- return error;
-}
-
-/**
* ea_alloc_blk - allocates a new block for extended attributes.
* @ip: A pointer to the inode that's getting extended attributes
* @bhp: Pointer to pointer to a struct buffer_head
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
- if (er->er_flags & GFS2_ERF_MODE) {
- gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
- (ip->i_inode.i_mode & S_IFMT) ==
- (er->er_mode & S_IFMT));
- ip->i_inode.i_mode = er->er_mode;
- }
ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
* Returns: errno
*/
-static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+ const void *data, size_t size)
{
+ struct gfs2_ea_request er;
unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
unsigned int blks = 1;
- if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
- blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+ er.er_type = type;
+ er.er_name = name;
+ er.er_name_len = strlen(name);
+ er.er_data = (void *)data;
+ er.er_data_len = size;
+
+ if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+ blks += DIV_ROUND_UP(er.er_data_len, jbsize);
- return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+ return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
}
static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto out;
-
- if (er->er_flags & GFS2_ERF_MODE) {
- gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
- (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
- ip->i_inode.i_mode = er->er_mode;
- }
ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
int stuffed;
int error;
- stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+ stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+ es->es_er->er_data_len, &size);
if (ea->ea_type == GFS2_EATYPE_UNUSED) {
if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
return error;
}
-static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
- struct gfs2_ea_location *el)
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+ const void *value, size_t size, struct gfs2_ea_location *el)
{
+ struct gfs2_ea_request er;
struct ea_set es;
unsigned int blks = 2;
int error;
+ er.er_type = type;
+ er.er_name = name;
+ er.er_data = (void *)value;
+ er.er_name_len = strlen(name);
+ er.er_data_len = size;
+
memset(&es, 0, sizeof(struct ea_set));
- es.es_er = er;
+ es.es_er = &er;
es.es_el = el;
error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
blks++;
- if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
- blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+ if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+ blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
- return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+ return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
}
static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
GFS2_EA2NEXT(el->el_prev) == el->el_ea);
}
- return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
-}
-
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_ea_location el;
- int error;
-
- if (!ip->i_eattr) {
- if (er->er_flags & XATTR_REPLACE)
- return -ENODATA;
- return ea_init(ip, er);
- }
-
- error = gfs2_ea_find(ip, er, &el);
- if (error)
- return error;
-
- if (el.el_ea) {
- if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
- brelse(el.el_bh);
- return -EPERM;
- }
-
- error = -EEXIST;
- if (!(er->er_flags & XATTR_CREATE)) {
- int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
- error = ea_set_i(ip, er, &el);
- if (!error && unstuffed)
- ea_set_remove_unstuffed(ip, &el);
- }
-
- brelse(el.el_bh);
- } else {
- error = -ENODATA;
- if (!(er->er_flags & XATTR_REPLACE))
- error = ea_set_i(ip, er, NULL);
- }
-
- return error;
-}
-
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
-{
- struct gfs2_holder i_gh;
- int error;
-
- if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
- if (!er->er_data || !er->er_data_len) {
- er->er_data = NULL;
- er->er_data_len = 0;
- }
- error = ea_check_size(GFS2_SB(&ip->i_inode), er);
- if (error)
- return error;
-
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
- if (error)
- return error;
-
- if (IS_IMMUTABLE(&ip->i_inode))
- error = -EPERM;
- else
- error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
-
- gfs2_glock_dq_uninit(&i_gh);
-
- return error;
+ return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
}
static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
if (GFS2_EA_IS_LAST(ea))
prev->ea_flags |= GFS2_EAFLAG_LAST;
- } else
+ } else {
ea->ea_type = GFS2_EATYPE_UNUSED;
+ }
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
return error;
}
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_ea_location el;
int error;
if (!ip->i_eattr)
return -ENODATA;
- error = gfs2_ea_find(ip, er, &el);
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
if (GFS2_EA_IS_STUFFED(el.el_ea))
error = ea_remove_stuffed(ip, &el);
else
- error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
- 0);
+ error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
brelse(el.el_bh);
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
}
/**
- * gfs2_ea_remove - sets (or creates or replaces) an extended attribute
- * @ip: pointer to the inode of the target file
- * @er: request information
+ * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
*
- * Returns: errno
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
*/
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+ const void *value, size_t size, int flags)
{
- struct gfs2_holder i_gh;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_ea_location el;
+ unsigned int namel = strlen(name);
int error;
- if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
- return -EINVAL;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+ if (namel > GFS2_EA_MAX_NAME_LEN)
+ return -ERANGE;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (value == NULL)
+ return gfs2_xattr_remove(inode, type, name);
+
+ if (ea_check_size(sdp, namel, size))
+ return -ERANGE;
+
+ if (!ip->i_eattr) {
+ if (flags & XATTR_REPLACE)
+ return -ENODATA;
+ return ea_init(ip, type, name, value, size);
+ }
+
+ error = gfs2_ea_find(ip, type, name, &el);
if (error)
return error;
- if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
- error = -EPERM;
- else
- error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+ if (el.el_ea) {
+ if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+ brelse(el.el_bh);
+ return -EPERM;
+ }
- gfs2_glock_dq_uninit(&i_gh);
+ error = -EEXIST;
+ if (!(flags & XATTR_CREATE)) {
+ int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+ error = ea_set_i(ip, type, name, value, size, &el);
+ if (!error && unstuffed)
+ ea_set_remove_unstuffed(ip, &el);
+ }
+
+ brelse(el.el_bh);
+ return error;
+ }
+
+ error = -ENODATA;
+ if (!(flags & XATTR_REPLACE))
+ error = ea_set_i(ip, type, name, value, size, NULL);
return error;
}
@@ -1503,3 +1495,64 @@ out_alloc:
return error;
}
+static int gfs2_xattr_user_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
+}
+
+static int gfs2_xattr_user_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
+}
+
+static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
+}
+
+static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
+}
+
+static int gfs2_xattr_security_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
+}
+
+static int gfs2_xattr_security_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
+}
+
+static struct xattr_handler gfs2_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = gfs2_xattr_user_get,
+ .set = gfs2_xattr_user_set,
+};
+
+static struct xattr_handler gfs2_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = gfs2_xattr_security_get,
+ .set = gfs2_xattr_security_set,
+};
+
+static struct xattr_handler gfs2_xattr_system_handler = {
+ .prefix = XATTR_SYSTEM_PREFIX,
+ .get = gfs2_xattr_system_get,
+ .set = gfs2_xattr_system_set,
+};
+
+struct xattr_handler *gfs2_xattr_handlers[] = {
+ &gfs2_xattr_user_handler,
+ &gfs2_xattr_security_handler,
+ &gfs2_xattr_system_handler,
+ NULL,
+};
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h
index c82dbe01d713..cbdfd7743733 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
#define GFS2_EA_SIZE(ea) \
ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
- (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+ (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
#define GFS2_EAREQ_SIZE_STUFFED(er) \
ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
-#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
-ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
- sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
-
#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
#define GFS2_EA_BH2FIRST(bh) \
((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
-#define GFS2_ERF_MODE 0x80000000
-
struct gfs2_ea_request {
const char *er_name;
char *er_data;
unsigned int er_name_len;
unsigned int er_data_len;
unsigned int er_type; /* GFS2_EATYPE_... */
- int er_flags;
- mode_t er_mode;
};
struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
struct gfs2_ea_header *el_prev;
};
-int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
-
-int gfs2_ea_dealloc(struct gfs2_inode *ip);
+extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+ void *buffer, size_t size);
+extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+ const void *value, size_t size, int flags);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
-int gfs2_ea_find(struct gfs2_inode *ip,
- struct gfs2_ea_request *er,
- struct gfs2_ea_location *el);
-int gfs2_ea_get_copy(struct gfs2_inode *ip,
- struct gfs2_ea_location *el,
- char *data);
-int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
- struct iattr *attr, char *data);
-
-static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
-{
- switch (ea->ea_type) {
- case GFS2_EATYPE_USR:
- return 5 + ea->ea_name_len + 1;
- case GFS2_EATYPE_SYS:
- return 7 + ea->ea_name_len + 1;
- case GFS2_EATYPE_SECURITY:
- return 9 + ea->ea_name_len + 1;
- default:
- return 0;
- }
-}
+extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+ struct gfs2_ea_location *el);
+extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ char *data, size_t size);
+extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 941c8425c10b..06b7c2623f99 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
static struct backing_dev_info hugetlbfs_backing_dev_info = {
+ .name = "hugetlbfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
@@ -506,6 +507,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
INIT_LIST_HEAD(&inode->i_mapping->private_list);
info = HUGETLBFS_I(inode);
+ /*
+ * The policy is initialized here even if we are creating a
+ * private inode because initialization simply creates an
+ * an empty rb tree and calls spin_lock_init(), later when we
+ * call mpol_free_shared_policy() it will just return because
+ * the rb tree will still be empty.
+ */
mpol_shared_policy_init(&info->policy, NULL);
switch (mode & S_IFMT) {
default:
@@ -930,31 +938,39 @@ static struct file_system_type hugetlbfs_fs_type = {
static struct vfsmount *hugetlbfs_vfsmount;
-static int can_do_hugetlb_shm(void)
+static int can_do_hugetlb_shm(int creat_flags)
{
- return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
+ if (creat_flags != HUGETLB_SHMFS_INODE)
+ return 0;
+ if (capable(CAP_IPC_LOCK))
+ return 1;
+ if (in_group_p(sysctl_hugetlb_shm_group))
+ return 1;
+ return 0;
}
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+ struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
- int unlock_shm = 0;
struct file *file;
struct inode *inode;
struct dentry *dentry, *root;
struct qstr quick_string;
- struct user_struct *user = current_user();
+ *user = NULL;
if (!hugetlbfs_vfsmount)
return ERR_PTR(-ENOENT);
- if (!can_do_hugetlb_shm()) {
- if (user_shm_lock(size, user)) {
- unlock_shm = 1;
+ if (!can_do_hugetlb_shm(creat_flags)) {
+ *user = current_user();
+ if (user_shm_lock(size, *user)) {
WARN_ONCE(1,
"Using mlock ulimits for SHM_HUGETLB deprecated\n");
- } else
+ } else {
+ *user = NULL;
return ERR_PTR(-EPERM);
+ }
}
root = hugetlbfs_vfsmount->mnt_root;
@@ -996,8 +1012,10 @@ out_inode:
out_dentry:
dput(dentry);
out_shm_unlock:
- if (unlock_shm)
- user_shm_unlock(size, user);
+ if (*user) {
+ user_shm_unlock(size, *user);
+ *user = NULL;
+ }
return ERR_PTR(error);
}
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..f5ff71cb3e2b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -123,7 +123,7 @@ static void wake_up_inode(struct inode *inode)
int inode_init_always(struct super_block *sb, struct inode *inode)
{
static const struct address_space_operations empty_aops;
- static struct inode_operations empty_iops;
+ static const struct inode_operations empty_iops;
static const struct file_operations empty_fops;
struct address_space *const mapping = &inode->i_data;
@@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (sb->s_bdev) {
struct backing_dev_info *bdi;
- bdi = sb->s_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
mapping->backing_dev_info = bdi;
}
inode->i_private = NULL;
@@ -697,13 +695,15 @@ void unlock_new_inode(struct inode *inode)
}
#endif
/*
- * This is special! We do not need the spinlock
- * when clearing I_LOCK, because we're guaranteed
- * that nobody else tries to do anything about the
- * state of the inode when it is locked, as we
- * just created it (so there can be no old holders
- * that haven't tested I_LOCK).
+ * This is special! We do not need the spinlock when clearing I_LOCK,
+ * because we're guaranteed that nobody else tries to do anything about
+ * the state of the inode when it is locked, as we just created it (so
+ * there can be no old holders that haven't tested I_LOCK).
+ * However we must emit the memory barrier so that other CPUs reliably
+ * see the clearing of I_LOCK after the other inode initialisation has
+ * completed.
*/
+ smp_mb();
WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
inode->i_state &= ~(I_LOCK|I_NEW);
wake_up_inode(inode);
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
- unsigned long blocknr, freed;
+ unsigned int blocknr, freed;
if (is_journal_aborted(journal))
return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %lu), "
- "freeing %lu\n",
+ "Cleaning journal tail from %d to %d (offset %u), "
+ "freeing %u\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
int bufs;
int flags;
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
ktime_t start_time;
u64 commit_time;
char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
int journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
struct journal_head **jh_out,
- unsigned long blocknr)
+ unsigned int blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
* Log buffer allocation routines:
*/
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
{
- unsigned long blocknr;
+ unsigned int blocknr;
spin_lock(&journal->j_state_lock);
J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
* this is a no-op. If needed, we can use j_blk_offset - everything is
* ready.
*/
-int journal_bmap(journal_t *journal, unsigned long blocknr,
- unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+ unsigned int *retp)
{
int err = 0;
- unsigned long ret;
+ unsigned int ret;
if (journal->j_inode) {
ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
char b[BDEVNAME_SIZE];
printk(KERN_ALERT "%s: journal block not found "
- "at offset %lu on %s\n",
+ "at offset %u on %s\n",
__func__,
blocknr,
bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
journal_t *journal = journal_init_common();
int err;
int n;
- unsigned long blocknr;
+ unsigned int blocknr;
if (!journal)
return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
static int journal_reset(journal_t *journal)
{
journal_superblock_t *sb = journal->j_superblock;
- unsigned long first, last;
+ unsigned int first, last;
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
- printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+ printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
first, last);
journal_fail_superblock(journal);
return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
**/
int journal_create(journal_t *journal)
{
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
journal_superblock_t *sb;
int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
if (sb->s_start == 0 && journal->j_tail_sequence ==
journal->j_transaction_sequence) {
jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
- "(start %ld, seq %d, errno %d)\n",
+ "(start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
goto out;
}
spin_lock(&journal->j_state_lock);
- jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence, journal->j_errno);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
- unsigned long old_tail;
+ unsigned int old_tail;
spin_lock(&journal->j_state_lock);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
{
int err;
unsigned int max, nbufs, next;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
unsigned int offset)
{
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
struct buffer_head *bh;
*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
- unsigned long next_log_block;
+ unsigned int next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
if (tid_geq(next_commit_ID, info->end_transaction))
break;
- jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
- jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ jbd_debug(3, "JBD: checking block %u\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
<= journal->j_blocksize) {
- unsigned long io_block;
+ unsigned int io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
- "block %ld in log\n",
+ "block %u in log\n",
err, io_block);
} else {
- unsigned long blocknr;
+ unsigned int blocknr;
J_ASSERT(obh != NULL);
blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
max = be32_to_cpu(header->r_count);
while (offset < max) {
- unsigned long blocknr;
+ unsigned int blocknr;
int err;
blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
{
struct list_head hash;
tid_t sequence; /* Used for recovery only */
- unsigned long blocknr;
+ unsigned int blocknr;
};
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
/* Utility functions to maintain the revoke table */
/* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
{
struct jbd_revoke_table_s *table = journal->j_revoke;
int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
(block << (hash_shift - 12))) & (table->hash_size - 1);
}
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
tid_t seq)
{
struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
/* Find a revoke record in the journal's hash table. */
static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
- unsigned long blocknr)
+ unsigned int blocknr)
{
struct list_head *hash_list;
struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
* by one.
*/
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
}
}
- jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+ jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
*/
int journal_set_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
*/
int journal_test_revoke(journal_t *journal,
- unsigned long blocknr,
+ unsigned int blocknr,
tid_t sequence)
{
struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
spin_lock_init(&transaction->t_handle_lock);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires =
+ round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
__log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
-
out:
return handle;
}
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ !JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
set_buffer_ordered(bh);
barrier_done = 1;
}
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
.nr_to_write = mapping->nrpages * 2,
.range_start = 0,
.range_end = i_size_read(mapping->host),
- .for_writepages = 1,
};
ret = generic_writepages(mapping, &wbc);
@@ -707,11 +707,13 @@ start_journal_io:
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
+ if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(journal->j_dev, NULL);
}
/*
@@ -834,7 +836,7 @@ wait_for_iobuf:
jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..a8a358bc0f21 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
first = be32_to_cpu(sb->s_first);
last = be32_to_cpu(sb->s_maxlen);
+ if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+ printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+ first, last);
+ journal_fail_superblock(journal);
+ return -EINVAL;
+ }
journal->j_first = first;
journal->j_last = last;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
__jbd2_log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
handle = ERR_PTR(err);
goto out;
}
-
- lock_map_acquire(&handle->h_lockdep_map);
out:
return handle;
}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
__jbd2_log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return rc;
}
-static int jffs2_check_acl(struct inode *inode, int mask)
+int jffs2_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl;
int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jffs2_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, jffs2_check_acl);
-}
-
int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
{
struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
-extern int jffs2_permission(struct inode *, int);
+extern int jffs2_check_acl(struct inode *, int);
extern int jffs2_acl_chmod(struct inode *);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
#else
-#define jffs2_permission (NULL)
+#define jffs2_check_acl (NULL)
#define jffs2_acl_chmod(inode) (0)
#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
#define jffs2_init_acl_post(inode) (0)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.rmdir = jffs2_rmdir,
.mknod = jffs2_mknod,
.rename = jffs2_rename,
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 23c947539864..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
const struct inode_operations jffs2_file_inode_operations =
{
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0035c021395a..9a80e8e595d0 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
}
-static struct export_operations jffs2_export_ops = {
+static const struct export_operations jffs2_export_ops = {
.get_parent = jffs2_get_parent,
.fh_to_dentry = jffs2_fh_to_dentry,
.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
.follow_link = jffs2_follow_link,
- .permission = jffs2_permission,
+ .check_acl = jffs2_check_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index d9a721e6db70..5ef7bac265e5 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
if (!c->wbuf)
return -ENOMEM;
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+ c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+ if (!c->wbuf_verify) {
+ kfree(c->wbuf);
+ return -ENOMEM;
+ }
+#endif
return 0;
}
void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) {
+#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
+ kfree(c->wbuf_verify);
+#endif
kfree(c->wbuf);
}
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a29c7c3e3fb8..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
return rc;
}
-static int jfs_check_acl(struct inode *inode, int mask)
+int jfs_check_acl(struct inode *inode, int mask)
{
struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
@@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jfs_permission(struct inode *inode, int mask)
-{
- return generic_permission(inode, mask, jfs_check_acl);
-}
-
int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
{
struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
.removexattr = jfs_removexattr,
#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
- .permission = jfs_permission,
+ .check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
-int jfs_permission(struct inode *, int);
+int jfs_check_acl(struct inode *, int);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
int jfs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.removexattr = jfs_removexattr,
#ifdef CONFIG_JFS_POSIX_ACL
.setattr = jfs_setattr,
- .permission = jfs_permission,
+ .check_acl = jfs_check_acl,
#endif
};
diff --git a/fs/libfs.c b/fs/libfs.c
index ddfa89948c3f..dcec3d3ea64f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
return PTR_ERR(s);
s->s_flags = MS_NOUSER;
- s->s_maxbytes = ~0ULL;
+ s->s_maxbytes = MAX_LFS_FILESIZE;
s->s_blocksize = PAGE_SIZE;
s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = magic;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..fc9032dc8862 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
- if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
+ if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4336adba952a..c81249fef11f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
}
-static struct file_lock_operations nlmclnt_lock_ops = {
+static const struct file_lock_operations nlmclnt_lock_ops = {
.fl_copy_lock = nlmclnt_locks_copy_lock,
.fl_release_private = nlmclnt_locks_release_private,
};
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd4325..4600c2037b8b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
return hash & (NLM_HOST_NRHASH - 1);
}
-static void nlm_clear_port(struct sockaddr *sap)
-{
- switch (sap->sa_family) {
- case AF_INET:
- ((struct sockaddr_in *)sap)->sin_port = 0;
- break;
- case AF_INET6:
- ((struct sockaddr_in6 *)sap)->sin6_port = 0;
- break;
- }
-}
-
/*
* Common host lookup routine for server & client
*/
@@ -123,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
*/
chain = &nlm_hosts[nlm_hash_address(ni->sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
- if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
+ if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
continue;
/* See if we have an NSM handle for this client */
@@ -137,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
if (host->h_server != ni->server)
continue;
if (ni->server &&
- !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
+ !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
continue;
/* Move to head of hash chain. */
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
host->h_addrbuf = nsm->sm_addrbuf;
memcpy(nlm_addr(host), ni->sap, ni->salen);
host->h_addrlen = ni->salen;
- nlm_clear_port(nlm_addr(host));
+ rpc_set_port(nlm_addr(host), 0);
memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
host->h_version = ni->version;
host->h_proto = ni->protocol;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b525849..f956651d0f65 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
return (struct sockaddr *)&nsm->sm_addr;
}
-static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
- const size_t len)
-{
- const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
- snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-}
-
-static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
- const size_t len)
-{
- const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
- if (ipv6_addr_v4mapped(&sin6->sin6_addr))
- snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
- else if (sin6->sin6_scope_id != 0)
- snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
- sin6->sin6_scope_id);
- else
- snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-}
-
-static void nsm_display_address(const struct sockaddr *sap,
- char *buf, const size_t len)
-{
- switch (sap->sa_family) {
- case AF_INET:
- nsm_display_ipv4_address(sap, buf, len);
- break;
- case AF_INET6:
- nsm_display_ipv6_address(sap, buf, len);
- break;
- default:
- snprintf(buf, len, "unsupported address family");
- break;
- }
-}
-
static struct rpc_clnt *nsm_create(void)
{
struct sockaddr_in sin = {
@@ -246,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
struct nsm_handle *nsm;
list_for_each_entry(nsm, &nsm_handles, sm_link)
- if (nlm_cmp_addr(nsm_addr(nsm), sap))
+ if (rpc_cmp_addr(nsm_addr(nsm), sap))
return nsm;
return NULL;
}
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
memcpy(nsm_addr(new), sap, salen);
new->sm_addrlen = salen;
nsm_init_private(new);
- nsm_display_address((const struct sockaddr *)&new->sm_addr,
- new->sm_addrbuf, sizeof(new->sm_addrbuf));
+
+ if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
+ sizeof(new->sm_addrbuf)) == 0)
+ (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
+ "unsupported address family");
memcpy(new->sm_name, hostname, hostname_len);
new->sm_name[hostname_len] = '\0';
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e577a78d7bac..d1001790fa9a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
}
-struct lock_manager_operations nlmsvc_lock_operations = {
+const struct lock_manager_operations nlmsvc_lock_operations = {
.fl_compare_owner = nlmsvc_same_owner,
.fl_notify = nlmsvc_notify_blocked,
.fl_grant = nlmsvc_grant_deferred,
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 9e4d6aab611b..ad478da7ca63 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
static int
nlmsvc_match_ip(void *datap, struct nlm_host *host)
{
- return nlm_cmp_addr(nlm_srcaddr(host), datap);
+ return rpc_cmp_addr(nlm_srcaddr(host), datap);
}
/**
diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..a8794f233bc9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
return fl->fl_file == try->fl_file;
}
-static struct lock_manager_operations lease_manager_ops = {
+static const struct lock_manager_operations lease_manager_ops = {
.fl_break = lease_break_callback,
.fl_release_private = lease_release_private_callback,
.fl_mylease = lease_mylease_callback,
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
* give it the opportunity to lock the file.
*/
if (found)
- cond_resched_bkl();
+ cond_resched();
find_conflict:
for_each_lock(inode, before) {
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (can_sleep)
lock->fl_flags |= FL_SLEEP;
- error = security_file_lock(filp, cmd);
+ error = security_file_lock(filp, lock->fl_type);
if (error)
goto out_free;
diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..d11f404667e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
EXPORT_SYMBOL(putname);
#endif
-
-/**
- * generic_permission - check for access rights on a Posix-like filesystem
- * @inode: inode to check access rights for
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- * @check_acl: optional callback to check for Posix ACLs
- *
- * Used to check for read/write/execute permissions on a file.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things..
+/*
+ * This does basic POSIX ACL permission checking
*/
-int generic_permission(struct inode *inode, int mask,
+static int acl_permission_check(struct inode *inode, int mask,
int (*check_acl)(struct inode *inode, int mask))
{
umode_t mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
else {
if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
int error = check_acl(inode, mask);
- if (error == -EACCES)
- goto check_capabilities;
- else if (error != -EAGAIN)
+ if (error != -EAGAIN)
return error;
}
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
*/
if ((mask & ~mode) == 0)
return 0;
+ return -EACCES;
+}
+
+/**
+ * generic_permission - check for access rights on a Posix-like filesystem
+ * @inode: inode to check access rights for
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @check_acl: optional callback to check for Posix ACLs
+ *
+ * Used to check for read/write/execute permissions on a file.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things..
+ */
+int generic_permission(struct inode *inode, int mask,
+ int (*check_acl)(struct inode *inode, int mask))
+{
+ int ret;
+
+ /*
+ * Do the basic POSIX ACL permission checks.
+ */
+ ret = acl_permission_check(inode, mask, check_acl);
+ if (ret != -EACCES)
+ return ret;
- check_capabilities:
/*
* Read/write DACs are always overridable.
* Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
if (inode->i_op->permission)
retval = inode->i_op->permission(inode, mask);
else
- retval = generic_permission(inode, mask, NULL);
+ retval = generic_permission(inode, mask, inode->i_op->check_acl);
if (retval)
return retval;
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
*/
static int exec_permission_lite(struct inode *inode)
{
- umode_t mode = inode->i_mode;
+ int ret;
- if (inode->i_op->permission)
- return -EAGAIN;
-
- if (current_fsuid() == inode->i_uid)
- mode >>= 6;
- else if (in_group_p(inode->i_gid))
- mode >>= 3;
-
- if (mode & MAY_EXEC)
- goto ok;
-
- if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
- goto ok;
-
- if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
+ if (inode->i_op->permission) {
+ ret = inode->i_op->permission(inode, MAY_EXEC);
+ if (!ret)
+ goto ok;
+ return ret;
+ }
+ ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
+ if (!ret)
goto ok;
- if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
+ if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
goto ok;
- return -EACCES;
+ return ret;
ok:
return security_inode_permission(inode, MAY_EXEC);
}
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
nd->flags |= LOOKUP_CONTINUE;
err = exec_permission_lite(inode);
- if (err == -EAGAIN)
- err = inode_permission(nd->path.dentry->d_inode,
- MAY_EXEC);
- if (!err)
- err = ima_path_check(&nd->path, MAY_EXEC,
- IMA_COUNT_UPDATE);
if (err)
break;
@@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag)
if (error)
return error;
- error = ima_path_check(path,
- acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+ error = ima_path_check(path, acc_mode ?
+ acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+ ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
IMA_COUNT_UPDATE);
+
if (error)
return error;
/*
* An append-only file must be opened in append mode for writing.
*/
if (IS_APPEND(inode)) {
+ error = -EPERM;
if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
- return -EPERM;
+ goto err_out;
if (flag & O_TRUNC)
- return -EPERM;
+ goto err_out;
}
/* O_NOATIME can only be set by the owner or superuser */
if (flag & O_NOATIME)
- if (!is_owner_or_cap(inode))
- return -EPERM;
+ if (!is_owner_or_cap(inode)) {
+ error = -EPERM;
+ goto err_out;
+ }
/*
* Ensure there are no outstanding leases on the file.
*/
error = break_lease(inode, flag);
if (error)
- return error;
+ goto err_out;
if (flag & O_TRUNC) {
error = get_write_access(inode);
if (error)
- return error;
+ goto err_out;
/*
* Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag)
}
put_write_access(inode);
if (error)
- return error;
+ goto err_out;
} else
if (flag & FMODE_WRITE)
vfs_dq_init(inode);
return 0;
+err_out:
+ ima_counts_put(path, acc_mode ?
+ acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+ ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
+ return error;
}
/*
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de2..da7fda639eac 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
direct.o pagelist.o proc.o read.o symlink.o unlink.o \
- write.o namespace.o mount_clnt.o
+ write.o namespace.o mount_clnt.o \
+ dns_resolve.o cache_lib.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000000..b4ffd0146ea6
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
+/*
+ * linux/fs/nfs/cache_lib.c
+ *
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
+
+#include "cache_lib.h"
+
+#define NFS_CACHE_UPCALL_PATHLEN 256
+#define NFS_CACHE_UPCALL_TIMEOUT 15
+
+static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
+ "/sbin/nfs_cache_getent";
+static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
+
+module_param_string(cache_getent, nfs_cache_getent_prog,
+ sizeof(nfs_cache_getent_prog), 0600);
+MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
+module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
+MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
+ "the cache upcall is assumed to have failed");
+
+int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
+{
+ static char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[] = {
+ nfs_cache_getent_prog,
+ cd->name,
+ entry_name,
+ NULL
+ };
+ int ret = -EACCES;
+
+ if (nfs_cache_getent_prog[0] == '\0')
+ goto out;
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or
+ * EACCES error. The admin can re-enable it on the fly by using
+ * sysfs to set the 'cache_getent' parameter once the problem
+ * has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES)
+ nfs_cache_getent_prog[0] = '\0';
+out:
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * Deferred request handling
+ */
+void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
+{
+ if (atomic_dec_and_test(&dreq->count))
+ kfree(dreq);
+}
+
+static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
+
+ complete_all(&dreq->completion);
+ nfs_cache_defer_req_put(dreq);
+}
+
+static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = container_of(req, struct nfs_cache_defer_req, req);
+ dreq->deferred_req.revisit = nfs_dns_cache_revisit;
+ atomic_inc(&dreq->count);
+
+ return &dreq->deferred_req;
+}
+
+struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
+{
+ struct nfs_cache_defer_req *dreq;
+
+ dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
+ if (dreq) {
+ init_completion(&dreq->completion);
+ atomic_set(&dreq->count, 1);
+ dreq->req.defer = nfs_dns_cache_defer;
+ }
+ return dreq;
+}
+
+int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
+{
+ if (wait_for_completion_timeout(&dreq->completion,
+ nfs_cache_getent_timeout * HZ) == 0)
+ return -ETIMEDOUT;
+ return 0;
+}
+
+int nfs_cache_register(struct cache_detail *cd)
+{
+ struct nameidata nd;
+ struct vfsmount *mnt;
+ int ret;
+
+ mnt = rpc_get_mount();
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+ ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
+ if (ret)
+ goto err;
+ ret = sunrpc_cache_register_pipefs(nd.path.dentry,
+ cd->name, 0600, cd);
+ path_put(&nd.path);
+ if (!ret)
+ return ret;
+err:
+ rpc_put_mount();
+ return ret;
+}
+
+void nfs_cache_unregister(struct cache_detail *cd)
+{
+ sunrpc_cache_unregister_pipefs(cd);
+ rpc_put_mount();
+}
+
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000000..76f856e284e4
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
+/*
+ * Helper routines for the NFS client caches
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ */
+
+#include <linux/completion.h>
+#include <linux/sunrpc/cache.h>
+#include <asm/atomic.h>
+
+/*
+ * Deferred request handling
+ */
+struct nfs_cache_defer_req {
+ struct cache_req req;
+ struct cache_deferred_req deferred_req;
+ struct completion completion;
+ atomic_t count;
+};
+
+extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
+extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
+extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
+extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
+
+extern int nfs_cache_register(struct cache_detail *cd);
+extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941fb..293fa0528a6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
unsigned int nfs_callback_set_tcpport;
unsigned short nfs_callback_tcpport;
unsigned short nfs_callback_tcpport6;
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
+#define NFS_CALLBACK_MAXPORTNR (65535U)
-static int param_set_port(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, struct kernel_param *kp)
{
- char *endp;
- int num = simple_strtol(val, &endp, 0);
- if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+ unsigned long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = strict_strtoul(val, 0, &num);
+ if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
return -EINVAL;
- *((int *)kp->arg) = num;
+ *((unsigned int *)kp->arg) = num;
return 0;
}
-module_param_call(callback_tcpport, param_set_port, param_get_int,
- &nfs_callback_set_tcpport, 0644);
+static int param_get_portnr(char *buffer, struct kernel_param *kp)
+{
+ return param_get_uint(buffer, kp);
+}
+#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
+
+module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
/*
* This is the NFSv4 callback kernel thread.
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index e5a2dac5f715..76b0aa0f73bf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -222,7 +222,7 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
p = read_buf(xdr, len);
if (unlikely(p == NULL))
- return htonl(NFS4ERR_RESOURCE);;
+ return htonl(NFS4ERR_RESOURCE);
memcpy(sid->data, p, len);
return 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..a7ce15d3c248 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
server->options = data->options;
+ server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
+ NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->backing_dev_info.name = "nfs";
server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
if (server->wsize > max_rpc_payload)
@@ -929,10 +933,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
goto out_error;
nfs_server_set_fsinfo(server, &fsinfo);
- error = bdi_init(&server->backing_dev_info);
- if (error)
- goto out_error;
-
/* Get some general file system info */
if (server->namelen == 0) {
@@ -991,6 +991,12 @@ static struct nfs_server *nfs_alloc_server(void)
return NULL;
}
+ if (bdi_init(&server->backing_dev_info)) {
+ nfs_free_iostats(server->io_stats);
+ kfree(server);
+ return NULL;
+ }
+
return server;
}
@@ -1074,10 +1080,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
spin_lock(&nfs_client_lock);
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1274,7 +1276,7 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
- server->caps |= NFS_CAP_ATOMIC_OPEN;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
server->options = data->options;
/* Get a client record */
@@ -1359,10 +1361,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
- BUG_ON(!server->nfs_client);
- BUG_ON(!server->nfs_client->rpc_ops);
- BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
-
spin_lock(&nfs_client_lock);
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1400,7 +1398,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
/* Initialise the client representation from the parent server */
nfs_server_copy_userdata(server, parent_server);
- server->caps |= NFS_CAP_ATOMIC_OPEN;
+ server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
/* Get a client representation.
* Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 489fc01a3204..6c3210099d51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata)
if (put_dreq(dreq))
nfs_direct_complete(dreq);
- nfs_readdata_release(calldata);
+ nfs_readdata_free(data);
}
static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
data->npages, 1, 0, data->pagevec, NULL);
up_read(&current->mm->mmap_sem);
if (result < 0) {
- nfs_readdata_release(data);
+ nfs_readdata_free(data);
break;
}
if ((unsigned)result < data->npages) {
bytes = result * PAGE_SIZE;
if (bytes <= pgbase) {
nfs_direct_release_pages(data->pagevec, result);
- nfs_readdata_release(data);
+ nfs_readdata_free(data);
break;
}
bytes -= pgbase;
@@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
data->inode = inode;
data->cred = msg.rpc_cred;
data->args.fh = NFS_FH(inode);
- data->args.context = get_nfs_open_context(ctx);
+ data->args.context = ctx;
data->args.offset = pos;
data->args.pgbase = pgbase;
data->args.pages = data->pagevec;
@@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
list_del(&data->pages);
nfs_direct_release_pages(data->pagevec, data->npages);
- nfs_writedata_release(data);
+ nfs_writedata_free(data);
}
}
@@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata)
dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
nfs_direct_write_complete(dreq, data->inode);
- nfs_commitdata_release(calldata);
+ nfs_commit_free(data);
}
static const struct rpc_call_ops nfs_commit_direct_ops = {
@@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
data->args.fh = NFS_FH(data->inode);
data->args.offset = 0;
data->args.count = 0;
- data->args.context = get_nfs_open_context(dreq->ctx);
+ data->args.context = dreq->ctx;
data->res.count = 0;
data->res.fattr = &data->fattr;
data->res.verf = &data->verf;
@@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
data->npages, 0, 0, data->pagevec, NULL);
up_read(&current->mm->mmap_sem);
if (result < 0) {
- nfs_writedata_release(data);
+ nfs_writedata_free(data);
break;
}
if ((unsigned)result < data->npages) {
bytes = result * PAGE_SIZE;
if (bytes <= pgbase) {
nfs_direct_release_pages(data->pagevec, result);
- nfs_writedata_release(data);
+ nfs_writedata_free(data);
break;
}
bytes -= pgbase;
@@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
data->inode = inode;
data->cred = msg.rpc_cred;
data->args.fh = NFS_FH(inode);
- data->args.context = get_nfs_open_context(ctx);
+ data->args.context = ctx;
data->args.offset = pos;
data->args.pgbase = pgbase;
data->args.pages = data->pagevec;
@@ -934,9 +934,6 @@ out:
* back into its cache. We let the server do generic write
* parameter checking and report problems.
*
- * We also avoid an unnecessary invocation of generic_osync_inode(),
- * as it is fairly meaningless to sync the metadata of an NFS file.
- *
* We eliminate local atime updates, see direct read above.
*
* We avoid unnecessary page cache invalidations for normal cached
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000000..f4d54ba97cc6
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
+/*
+ * linux/fs/nfs/dns_resolve.c
+ *
+ * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * Resolves DNS hostnames into valid ip addresses
+ */
+
+#include <linux/hash.h>
+#include <linux/string.h>
+#include <linux/kmod.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/seq_file.h>
+#include <linux/inet.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/cache.h>
+#include <linux/sunrpc/svcauth.h>
+
+#include "dns_resolve.h"
+#include "cache_lib.h"
+
+#define NFS_DNS_HASHBITS 4
+#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
+
+static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
+
+struct nfs_dns_ent {
+ struct cache_head h;
+
+ char *hostname;
+ size_t namelen;
+
+ struct sockaddr_storage addr;
+ size_t addrlen;
+};
+
+
+static void nfs_dns_ent_init(struct cache_head *cnew,
+ struct cache_head *ckey)
+{
+ struct nfs_dns_ent *new;
+ struct nfs_dns_ent *key;
+
+ new = container_of(cnew, struct nfs_dns_ent, h);
+ key = container_of(ckey, struct nfs_dns_ent, h);
+
+ kfree(new->hostname);
+ new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
+ if (new->hostname) {
+ new->namelen = key->namelen;
+ memcpy(&new->addr, &key->addr, key->addrlen);
+ new->addrlen = key->addrlen;
+ } else {
+ new->namelen = 0;
+ new->addrlen = 0;
+ }
+}
+
+static void nfs_dns_ent_put(struct kref *ref)
+{
+ struct nfs_dns_ent *item;
+
+ item = container_of(ref, struct nfs_dns_ent, h.ref);
+ kfree(item->hostname);
+ kfree(item);
+}
+
+static struct cache_head *nfs_dns_ent_alloc(void)
+{
+ struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
+
+ if (item != NULL) {
+ item->hostname = NULL;
+ item->namelen = 0;
+ item->addrlen = 0;
+ return &item->h;
+ }
+ return NULL;
+};
+
+static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
+{
+ return hash_str(key->hostname, NFS_DNS_HASHBITS);
+}
+
+static void nfs_dns_request(struct cache_detail *cd,
+ struct cache_head *ch,
+ char **bpp, int *blen)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+
+ qword_add(bpp, blen, key->hostname);
+ (*bpp)[-1] = '\n';
+}
+
+static int nfs_dns_upcall(struct cache_detail *cd,
+ struct cache_head *ch)
+{
+ struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
+ int ret;
+
+ ret = nfs_cache_upcall(cd, key->hostname);
+ if (ret)
+ ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
+ return ret;
+}
+
+static int nfs_dns_match(struct cache_head *ca,
+ struct cache_head *cb)
+{
+ struct nfs_dns_ent *a;
+ struct nfs_dns_ent *b;
+
+ a = container_of(ca, struct nfs_dns_ent, h);
+ b = container_of(cb, struct nfs_dns_ent, h);
+
+ if (a->namelen == 0 || a->namelen != b->namelen)
+ return 0;
+ return memcmp(a->hostname, b->hostname, a->namelen) == 0;
+}
+
+static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
+ struct cache_head *h)
+{
+ struct nfs_dns_ent *item;
+ long ttl;
+
+ if (h == NULL) {
+ seq_puts(m, "# ip address hostname ttl\n");
+ return 0;
+ }
+ item = container_of(h, struct nfs_dns_ent, h);
+ ttl = (long)item->h.expiry_time - (long)get_seconds();
+ if (ttl < 0)
+ ttl = 0;
+
+ if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
+ char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
+
+ rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
+ seq_printf(m, "%15s ", buf);
+ } else
+ seq_puts(m, "<none> ");
+ seq_printf(m, "%15s %ld\n", item->hostname, ttl);
+ return 0;
+}
+
+struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_lookup(cd,
+ &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
+ struct nfs_dns_ent *new,
+ struct nfs_dns_ent *key)
+{
+ struct cache_head *ch;
+
+ ch = sunrpc_cache_update(cd,
+ &new->h, &key->h,
+ nfs_dns_hash(key));
+ if (!ch)
+ return NULL;
+ return container_of(ch, struct nfs_dns_ent, h);
+}
+
+static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
+{
+ char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
+ struct nfs_dns_ent key, *item;
+ unsigned long ttl;
+ ssize_t len;
+ int ret = -EINVAL;
+
+ if (buf[buflen-1] != '\n')
+ goto out;
+ buf[buflen-1] = '\0';
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+ key.addrlen = rpc_pton(buf1, len,
+ (struct sockaddr *)&key.addr,
+ sizeof(key.addr));
+
+ len = qword_get(&buf, buf1, sizeof(buf1));
+ if (len <= 0)
+ goto out;
+
+ key.hostname = buf1;
+ key.namelen = len;
+ memset(&key.h, 0, sizeof(key.h));
+
+ ttl = get_expiry(&buf);
+ if (ttl == 0)
+ goto out;
+ key.h.expiry_time = ttl + get_seconds();
+
+ ret = -ENOMEM;
+ item = nfs_dns_lookup(cd, &key);
+ if (item == NULL)
+ goto out;
+
+ if (key.addrlen == 0)
+ set_bit(CACHE_NEGATIVE, &key.h.flags);
+
+ item = nfs_dns_update(cd, &key, item);
+ if (item == NULL)
+ goto out;
+
+ ret = 0;
+ cache_put(&item->h, cd);
+out:
+ return ret;
+}
+
+static struct cache_detail nfs_dns_resolve = {
+ .owner = THIS_MODULE,
+ .hash_size = NFS_DNS_HASHTBL_SIZE,
+ .hash_table = nfs_dns_table,
+ .name = "dns_resolve",
+ .cache_put = nfs_dns_ent_put,
+ .cache_upcall = nfs_dns_upcall,
+ .cache_parse = nfs_dns_parse,
+ .cache_show = nfs_dns_show,
+ .match = nfs_dns_match,
+ .init = nfs_dns_ent_init,
+ .update = nfs_dns_ent_init,
+ .alloc = nfs_dns_ent_alloc,
+};
+
+static int do_cache_lookup(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item,
+ struct nfs_cache_defer_req *dreq)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (*item) {
+ ret = cache_check(cd, &(*item)->h, &dreq->req);
+ if (ret)
+ *item = NULL;
+ }
+ return ret;
+}
+
+static int do_cache_lookup_nowait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ int ret = -ENOMEM;
+
+ *item = nfs_dns_lookup(cd, key);
+ if (!*item)
+ goto out_err;
+ ret = -ETIMEDOUT;
+ if (!test_bit(CACHE_VALID, &(*item)->h.flags)
+ || (*item)->h.expiry_time < get_seconds()
+ || cd->flush_time > (*item)->h.last_refresh)
+ goto out_put;
+ ret = -ENOENT;
+ if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
+ goto out_put;
+ return 0;
+out_put:
+ cache_put(&(*item)->h, cd);
+out_err:
+ *item = NULL;
+ return ret;
+}
+
+static int do_cache_lookup_wait(struct cache_detail *cd,
+ struct nfs_dns_ent *key,
+ struct nfs_dns_ent **item)
+{
+ struct nfs_cache_defer_req *dreq;
+ int ret = -ENOMEM;
+
+ dreq = nfs_cache_defer_req_alloc();
+ if (!dreq)
+ goto out;
+ ret = do_cache_lookup(cd, key, item, dreq);
+ if (ret == -EAGAIN) {
+ ret = nfs_cache_wait_for_upcall(dreq);
+ if (!ret)
+ ret = do_cache_lookup_nowait(cd, key, item);
+ }
+ nfs_cache_defer_req_put(dreq);
+out:
+ return ret;
+}
+
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen)
+{
+ struct nfs_dns_ent key = {
+ .hostname = name,
+ .namelen = namelen,
+ };
+ struct nfs_dns_ent *item = NULL;
+ ssize_t ret;
+
+ ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
+ if (ret == 0) {
+ if (salen >= item->addrlen) {
+ memcpy(sa, &item->addr, item->addrlen);
+ ret = item->addrlen;
+ } else
+ ret = -EOVERFLOW;
+ cache_put(&item->h, &nfs_dns_resolve);
+ } else if (ret == -ENOENT)
+ ret = -ESRCH;
+ return ret;
+}
+
+int nfs_dns_resolver_init(void)
+{
+ return nfs_cache_register(&nfs_dns_resolve);
+}
+
+void nfs_dns_resolver_destroy(void)
+{
+ nfs_cache_unregister(&nfs_dns_resolve);
+}
+
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000000..a3f0938babf7
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
+/*
+ * Resolve DNS hostnames into valid ip addresses
+ */
+#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
+#define __LINUX_FS_NFS_DNS_RESOLVE_H
+
+#define NFS_DNS_HOSTNAME_MAXLEN (128)
+
+extern int nfs_dns_resolver_init(void);
+extern void nfs_dns_resolver_destroy(void);
+extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+ struct sockaddr *sa, size_t salen);
+
+#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b678..5021b75d2d1e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
}
/*
+ * Decide whether a read/modify/write cycle may be more efficient
+ * then a modify/write/read cycle when writing to a page in the
+ * page cache.
+ *
+ * The modify/write/read cycle may occur if a page is read before
+ * being completely filled by the writer. In this situation, the
+ * page must be completely written to stable storage on the server
+ * before it can be refilled by reading in the page from the server.
+ * This can lead to expensive, small, FILE_SYNC mode writes being
+ * done.
+ *
+ * It may be more efficient to read the page first if the file is
+ * open for reading in addition to writing, the page is not marked
+ * as Uptodate, it is not dirty or waiting to be committed,
+ * indicating that it was previously allocated and then modified,
+ * that there were valid bytes of data in that range of the file,
+ * and that the new data won't completely replace the old data in
+ * that range of the file.
+ */
+static int nfs_want_read_modify_write(struct file *file, struct page *page,
+ loff_t pos, unsigned len)
+{
+ unsigned int pglen = nfs_page_length(page);
+ unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int end = offset + len;
+
+ if ((file->f_mode & FMODE_READ) && /* open for read? */
+ !PageUptodate(page) && /* Uptodate? */
+ !PagePrivate(page) && /* i/o request already? */
+ pglen && /* valid bytes of file? */
+ (end < pglen || offset)) /* replace all valid bytes? */
+ return 1;
+ return 0;
+}
+
+/*
* This does the "real" work of the write. We must allocate and lock the
* page to be sent back to the generic routine, which then copies the
* data from user space.
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
int ret;
- pgoff_t index;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
struct page *page;
- index = pos >> PAGE_CACHE_SHIFT;
+ int once_thru = 0;
dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
+start:
/*
* Prevent starvation issues if someone is doing a consistency
* sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
if (ret) {
unlock_page(page);
page_cache_release(page);
+ } else if (!once_thru &&
+ nfs_want_read_modify_write(file, page, pos, len)) {
+ once_thru = 1;
+ ret = nfs_readpage(file, page);
+ page_cache_release(page);
+ if (!ret)
+ goto start;
}
return ret;
}
@@ -479,6 +523,7 @@ const struct address_space_operations nfs_file_aops = {
.invalidatepage = nfs_invalidate_page,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
+ .migratepage = nfs_migrate_page,
.launder_page = nfs_launder_page,
};
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2cf..21a84d45916f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
static unsigned int fnvhash32(const void *, size_t);
-static struct rpc_pipe_ops idmap_upcall_ops = {
+static const struct rpc_pipe_ops idmap_upcall_ops = {
.upcall = idmap_pipe_upcall,
.downcall = idmap_pipe_downcall,
.destroy_msg = idmap_pipe_destroy_msg,
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
if (idmap == NULL)
return -ENOMEM;
- idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap",
- idmap, &idmap_upcall_ops, 0);
+ idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
+ "idmap", idmap, &idmap_upcall_ops, 0);
if (IS_ERR(idmap->idmap_dentry)) {
error = PTR_ERR(idmap->idmap_dentry);
kfree(idmap);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a8..060022b4651c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
#include "iostat.h"
#include "internal.h"
#include "fscache.h"
+#include "dns_resolve.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
/* We can't support update_atime(), since the server will reset it */
inode->i_flags |= S_NOATIME|S_NOCMTIME;
inode->i_mode = fattr->mode;
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
+ && nfs_server_capable(inode, NFS_CAP_MODE))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
*/
@@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
nfsi->attr_gencount = fattr->gencount;
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
inode->i_atime = fattr->atime;
+ else if (nfs_server_capable(inode, NFS_CAP_ATIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
inode->i_mtime = fattr->mtime;
+ else if (nfs_server_capable(inode, NFS_CAP_MTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
inode->i_ctime = fattr->ctime;
+ else if (nfs_server_capable(inode, NFS_CAP_CTIME))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
nfsi->change_attr = fattr->change_attr;
+ else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA;
if (fattr->valid & NFS_ATTR_FATTR_SIZE)
inode->i_size = nfs_size_to_loff_t(fattr->size);
+ else
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE;
if (fattr->valid & NFS_ATTR_FATTR_NLINK)
inode->i_nlink = fattr->nlink;
+ else if (nfs_server_capable(inode, NFS_CAP_NLINK))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_GROUP)
inode->i_gid = fattr->gid;
+ else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -1145,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
loff_t cur_isize, new_isize;
unsigned long invalid = 0;
unsigned long now = jiffies;
+ unsigned long save_cache_validity;
dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
nfsi->read_cache_jiffies = fattr->time_start;
- if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
- nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ATIME
- | NFS_INO_REVAL_PAGECACHE);
+ save_cache_validity = nfsi->cache_validity;
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED
+ | NFS_INO_REVAL_PAGECACHE);
/* Do atomic weak cache consistency updates */
nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
nfsi->change_attr = fattr->change_attr;
}
- }
+ } else if (server->caps & NFS_CAP_CHANGE_ATTR)
+ invalid |= save_cache_validity;
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
/* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
}
- }
+ } else if (server->caps & NFS_CAP_MTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
/* If ctime has changed we should definitely clear access+acl caches */
if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
}
- }
+ } else if (server->caps & NFS_CAP_CTIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dprintk("NFS: isize change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
}
- }
+ } else
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
+ else if (server->caps & NFS_CAP_ATIME)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_mode = fattr->mode;
}
- }
+ } else if (server->caps & NFS_CAP_MODE)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (inode->i_uid != fattr->uid) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- }
+ } else if (server->caps & NFS_CAP_OWNER)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
+
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (inode->i_gid != fattr->gid) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- }
+ } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
inode->i_nlink = fattr->nlink;
}
- }
+ } else if (server->caps & NFS_CAP_NLINK)
+ invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
@@ -1293,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
|| S_ISLNK(inode->i_mode)))
invalid &= ~NFS_INO_INVALID_DATA;
if (!nfs_have_delegation(inode, FMODE_READ) ||
- (nfsi->cache_validity & NFS_INO_REVAL_FORCED))
+ (save_cache_validity & NFS_INO_REVAL_FORCED))
nfsi->cache_validity |= invalid;
- nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
return 0;
out_changed:
@@ -1442,6 +1507,10 @@ static int __init init_nfs_fs(void)
{
int err;
+ err = nfs_dns_resolver_init();
+ if (err < 0)
+ goto out8;
+
err = nfs_fscache_register();
if (err < 0)
goto out7;
@@ -1500,6 +1569,8 @@ out5:
out6:
nfs_fscache_unregister();
out7:
+ nfs_dns_resolver_destroy();
+out8:
return err;
}
@@ -1511,6 +1582,7 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_inodecache();
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
+ nfs_dns_resolver_destroy();
#ifdef CONFIG_PROC_FS
rpc_proc_unregister("nfs");
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..e21b1bb9972f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -49,6 +49,11 @@ struct nfs_clone_mount {
#define NFS_MAX_SECFLAVORS (12)
/*
+ * Value used if the user did not specify a port value.
+ */
+#define NFS_UNSPEC_PORT (-1)
+
+/*
* In-kernel mount arguments
*/
struct nfs_parsed_mount_data {
@@ -63,6 +68,7 @@ struct nfs_parsed_mount_data {
unsigned int auth_flavor_len;
rpc_authflavor_t auth_flavors[1];
char *client_address;
+ unsigned int version;
unsigned int minorversion;
char *fscache_uniq;
@@ -71,7 +77,7 @@ struct nfs_parsed_mount_data {
size_t addrlen;
char *hostname;
u32 version;
- unsigned short port;
+ int port;
unsigned short protocol;
} mount_server;
@@ -80,7 +86,7 @@ struct nfs_parsed_mount_data {
size_t addrlen;
char *hostname;
char *export_path;
- unsigned short port;
+ int port;
unsigned short protocol;
} nfs_server;
@@ -102,6 +108,7 @@ struct nfs_mount_request {
};
extern int nfs_mount(struct nfs_mount_request *info);
+extern void nfs_umount(const struct nfs_mount_request *info);
/* client.c */
extern struct rpc_program nfs_program;
@@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode);
extern int nfs_wait_bit_killable(void *word);
/* super.c */
-void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
extern struct file_system_type nfs_xdev_fs_type;
#ifdef CONFIG_NFS_V4
extern struct file_system_type nfs4_xdev_fs_type;
@@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
/* write.c */
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
+#ifdef CONFIG_MIGRATION
+extern int nfs_migrate_page(struct address_space *,
+ struct page *, struct page *);
+#else
+#define nfs_migrate_page NULL
+#endif
/* nfs4proc.c */
extern int _nfs4_call_sync(struct nfs_server *server,
@@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
return ((unsigned long)len + (unsigned long)base +
PAGE_SIZE - 1) >> PAGE_SHIFT;
}
-
-#define IPV6_SCOPE_DELIMITER '%'
-
-/*
- * Set the port number in an address. Be agnostic about the address
- * family.
- */
-static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
- struct sockaddr_in *ap = (struct sockaddr_in *)sap;
- struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
-
- switch (sap->sa_family) {
- case AF_INET:
- ap->sin_port = htons(port);
- break;
- case AF_INET6:
- ap6->sin6_port = htons(port);
- break;
- }
-}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..0adefc40cc89 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -209,6 +209,71 @@ out_mnt_err:
goto out;
}
+/**
+ * nfs_umount - Notify a server that we have unmounted this export
+ * @info: pointer to umount request arguments
+ *
+ * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
+ * use UDP.
+ */
+void nfs_umount(const struct nfs_mount_request *info)
+{
+ static const struct rpc_timeout nfs_umnt_timeout = {
+ .to_initval = 1 * HZ,
+ .to_maxval = 3 * HZ,
+ .to_retries = 2,
+ };
+ struct rpc_create_args args = {
+ .protocol = IPPROTO_UDP,
+ .address = info->sap,
+ .addrsize = info->salen,
+ .timeout = &nfs_umnt_timeout,
+ .servername = info->hostname,
+ .program = &mnt_program,
+ .version = info->version,
+ .authflavor = RPC_AUTH_UNIX,
+ .flags = RPC_CLNT_CREATE_NOPING,
+ };
+ struct mountres result;
+ struct rpc_message msg = {
+ .rpc_argp = info->dirpath,
+ .rpc_resp = &result,
+ };
+ struct rpc_clnt *clnt;
+ int status;
+
+ if (info->noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
+ clnt = rpc_create(&args);
+ if (unlikely(IS_ERR(clnt)))
+ goto out_clnt_err;
+
+ dprintk("NFS: sending UMNT request for %s:%s\n",
+ (info->hostname ? info->hostname : "server"), info->dirpath);
+
+ if (info->version == NFS_MNT3_VERSION)
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
+ else
+ msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
+
+ status = rpc_call_sync(clnt, &msg, 0);
+ rpc_shutdown_client(clnt);
+
+ if (unlikely(status < 0))
+ goto out_call_err;
+
+ return;
+
+out_clnt_err:
+ dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
+ PTR_ERR(clnt));
+ return;
+
+out_call_err:
+ dprintk("NFS: UMNT request failed, status=%d\n", status);
+}
+
/*
* XDR encode/decode functions for MOUNT
*/
@@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
return -EIO;
status = ntohl(*p);
- for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) {
+ for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
if (mnt_errtbl[i].status == status) {
res->errno = mnt_errtbl[i].errno;
return 0;
@@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
return -EIO;
status = ntohl(*p);
- for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) {
+ for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
if (mnt3_errtbl[i].status == status) {
res->errno = mnt3_errtbl[i].errno;
return 0;
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = {
.p_statidx = MOUNTPROC_MNT,
.p_name = "MOUNT",
},
+ [MOUNTPROC_UMNT] = {
+ .p_proc = MOUNTPROC_UMNT,
+ .p_encode = (kxdrproc_t)mnt_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC_UMNT,
+ .p_name = "UMOUNT",
+ },
};
static struct rpc_procinfo mnt3_procedures[] = {
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
.p_statidx = MOUNTPROC3_MNT,
.p_name = "MOUNT",
},
+ [MOUNTPROC3_UMNT] = {
+ .p_proc = MOUNTPROC3_UMNT,
+ .p_encode = (kxdrproc_t)mnt_enc_dirpath,
+ .p_arglen = MNT_enc_dirpath_sz,
+ .p_statidx = MOUNTPROC3_UMNT,
+ .p_name = "UMOUNT",
+ },
};
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0cc5ce0edfe..ee6a13f05443 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
/*
* Create a regular file.
- * For now, we don't implement O_EXCL.
*/
static int
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2a2a0a7143ad..2636c26d56fa 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -17,6 +17,7 @@
#include <linux/inet.h>
#include "internal.h"
#include "nfs4_fs.h"
+#include "dns_resolve.h"
#define NFSDBG_FACILITY NFSDBG_VFS
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
return 0;
}
+static size_t nfs_parse_server_name(char *string, size_t len,
+ struct sockaddr *sa, size_t salen)
+{
+ ssize_t ret;
+
+ ret = rpc_pton(string, len, sa, salen);
+ if (ret == 0) {
+ ret = nfs_dns_resolve_name(string, len, sa, salen);
+ if (ret < 0)
+ ret = 0;
+ }
+ return ret;
+}
+
static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
char *page, char *page2,
const struct nfs4_fs_location *location)
@@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
continue;
- nfs_parse_ip_address(buf->data, buf->len,
- mountdata->addr, &mountdata->addrlen);
- if (mountdata->addr->sa_family == AF_UNSPEC)
+ mountdata->addrlen = nfs_parse_server_name(buf->data,
+ buf->len,
+ mountdata->addr, mountdata->addrlen);
+ if (mountdata->addrlen == 0)
continue;
- nfs_set_port(mountdata->addr, NFS_PORT);
+ rpc_set_port(mountdata->addr, NFS_PORT);
memcpy(page2, buf->data, buf->len);
page2[buf->len] = '\0';
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6917311f201c..be6544aef41f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -61,6 +61,8 @@
#define NFS4_POLL_RETRY_MIN (HZ/10)
#define NFS4_POLL_RETRY_MAX (15*HZ)
+#define NFS4_MAX_LOOP_ON_RECOVER (10)
+
struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
@@ -426,17 +428,19 @@ out:
static int nfs4_recover_session(struct nfs4_session *session)
{
struct nfs_client *clp = session->clp;
+ unsigned int loop;
int ret;
- for (;;) {
+ for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
ret = nfs4_wait_clnt_recover(clp);
if (ret != 0)
- return ret;
+ break;
if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
break;
nfs4_schedule_state_manager(clp);
+ ret = -EIO;
}
- return 0;
+ return ret;
}
static int nfs41_setup_sequence(struct nfs4_session *session,
@@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
static int nfs4_recover_expired_lease(struct nfs_server *server)
{
struct nfs_client *clp = server->nfs_client;
+ unsigned int loop;
int ret;
- for (;;) {
+ for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
ret = nfs4_wait_clnt_recover(clp);
if (ret != 0)
- return ret;
+ break;
if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
!test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
break;
nfs4_schedule_state_recovery(clp);
+ ret = -EIO;
}
- return 0;
+ return ret;
}
/*
@@ -1997,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
status = nfs4_call_sync(server, &msg, &args, &res, 0);
if (status == 0) {
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
+ server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
+ NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
+ NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
+ NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
+ NFS_CAP_CTIME|NFS_CAP_MTIME);
if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
server->caps |= NFS_CAP_ACLS;
if (res.has_links != 0)
server->caps |= NFS_CAP_HARDLINKS;
if (res.has_symlinks != 0)
server->caps |= NFS_CAP_SYMLINKS;
+ if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
+ server->caps |= NFS_CAP_FILEID;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
+ server->caps |= NFS_CAP_MODE;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
+ server->caps |= NFS_CAP_NLINK;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
+ server->caps |= NFS_CAP_OWNER;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
+ server->caps |= NFS_CAP_OWNER_GROUP;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
+ server->caps |= NFS_CAP_ATIME;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
+ server->caps |= NFS_CAP_CTIME;
+ if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
+ server->caps |= NFS_CAP_MTIME;
+
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 65ca8c18476f..2ef4fecf3984 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl)
nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
}
-static struct file_lock_operations nfs4_fl_lock_ops = {
+static const struct file_lock_operations nfs4_fl_lock_ops = {
.fl_copy_lock = nfs4_fl_copy_lock,
.fl_release_private = nfs4_fl_release_lock,
};
@@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
continue;
}
/* Initialize or reset the session */
- if (nfs4_has_session(clp) &&
- test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) {
+ if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)
+ && nfs4_has_session(clp)) {
if (clp->cl_cons_state == NFS_CS_SESSION_INITING)
status = nfs4_initialize_session(clp);
else
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..cfc30d362f94 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -702,29 +702,12 @@ struct compound_hdr {
u32 minorversion;
};
-/*
- * START OF "GENERIC" ENCODE ROUTINES.
- * These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define WRITE32(n) *p++ = htonl(n)
-#define WRITE64(n) do { \
- *p++ = htonl((uint32_t)((n) >> 32)); \
- *p++ = htonl((uint32_t)(n)); \
-} while (0)
-#define WRITEMEM(ptr,nbytes) do { \
- p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
-} while (0)
-
-#define RESERVE_SPACE(nbytes) do { \
- p = xdr_reserve_space(xdr, nbytes); \
- BUG_ON(!p); \
-} while (0)
+static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
+{
+ __be32 *p = xdr_reserve_space(xdr, nbytes);
+ BUG_ON(!p);
+ return p;
+}
static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
{
@@ -749,12 +732,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
- RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2));
- WRITE32(hdr->taglen);
- WRITEMEM(hdr->tag, hdr->taglen);
- WRITE32(hdr->minorversion);
+ p = reserve_space(xdr, 4 + hdr->taglen + 8);
+ p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
+ *p++ = cpu_to_be32(hdr->minorversion);
hdr->nops_p = p;
- WRITE32(hdr->nops);
+ *p = cpu_to_be32(hdr->nops);
}
static void encode_nops(struct compound_hdr *hdr)
@@ -829,55 +811,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len += 16;
else if (iap->ia_valid & ATTR_MTIME)
len += 4;
- RESERVE_SPACE(len);
+ p = reserve_space(xdr, len);
/*
* We write the bitmap length now, but leave the bitmap and the attribute
* buffer length to be backfilled at the end of this routine.
*/
- WRITE32(2);
+ *p++ = cpu_to_be32(2);
q = p;
p += 3;
if (iap->ia_valid & ATTR_SIZE) {
bmval0 |= FATTR4_WORD0_SIZE;
- WRITE64(iap->ia_size);
+ p = xdr_encode_hyper(p, iap->ia_size);
}
if (iap->ia_valid & ATTR_MODE) {
bmval1 |= FATTR4_WORD1_MODE;
- WRITE32(iap->ia_mode & S_IALLUGO);
+ *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
}
if (iap->ia_valid & ATTR_UID) {
bmval1 |= FATTR4_WORD1_OWNER;
- WRITE32(owner_namelen);
- WRITEMEM(owner_name, owner_namelen);
+ p = xdr_encode_opaque(p, owner_name, owner_namelen);
}
if (iap->ia_valid & ATTR_GID) {
bmval1 |= FATTR4_WORD1_OWNER_GROUP;
- WRITE32(owner_grouplen);
- WRITEMEM(owner_group, owner_grouplen);
+ p = xdr_encode_opaque(p, owner_group, owner_grouplen);
}
if (iap->ia_valid & ATTR_ATIME_SET) {
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- WRITE32(NFS4_SET_TO_CLIENT_TIME);
- WRITE32(0);
- WRITE32(iap->ia_mtime.tv_sec);
- WRITE32(iap->ia_mtime.tv_nsec);
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
}
else if (iap->ia_valid & ATTR_ATIME) {
bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
- WRITE32(NFS4_SET_TO_SERVER_TIME);
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
if (iap->ia_valid & ATTR_MTIME_SET) {
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- WRITE32(NFS4_SET_TO_CLIENT_TIME);
- WRITE32(0);
- WRITE32(iap->ia_mtime.tv_sec);
- WRITE32(iap->ia_mtime.tv_nsec);
+ *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
+ *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
}
else if (iap->ia_valid & ATTR_MTIME) {
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
- WRITE32(NFS4_SET_TO_SERVER_TIME);
+ *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
}
/*
@@ -891,7 +871,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
len = (char *)p - (char *)q - 12;
*q++ = htonl(bmval0);
*q++ = htonl(bmval1);
- *q++ = htonl(len);
+ *q = htonl(len);
/* out: */
}
@@ -900,9 +880,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
{
__be32 *p;
- RESERVE_SPACE(8);
- WRITE32(OP_ACCESS);
- WRITE32(access);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(OP_ACCESS);
+ *p = cpu_to_be32(access);
hdr->nops++;
hdr->replen += decode_access_maxsz;
}
@@ -911,10 +891,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
{
__be32 *p;
- RESERVE_SPACE(8+NFS4_STATEID_SIZE);
- WRITE32(OP_CLOSE);
- WRITE32(arg->seqid->sequence->counter);
- WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_CLOSE);
+ *p++ = cpu_to_be32(arg->seqid->sequence->counter);
+ xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
hdr->nops++;
hdr->replen += decode_close_maxsz;
}
@@ -923,10 +903,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
{
__be32 *p;
- RESERVE_SPACE(16);
- WRITE32(OP_COMMIT);
- WRITE64(args->offset);
- WRITE32(args->count);
+ p = reserve_space(xdr, 16);
+ *p++ = cpu_to_be32(OP_COMMIT);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
hdr->nops++;
hdr->replen += decode_commit_maxsz;
}
@@ -935,30 +915,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
{
__be32 *p;
- RESERVE_SPACE(8);
- WRITE32(OP_CREATE);
- WRITE32(create->ftype);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(OP_CREATE);
+ *p = cpu_to_be32(create->ftype);
switch (create->ftype) {
case NF4LNK:
- RESERVE_SPACE(4);
- WRITE32(create->u.symlink.len);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(create->u.symlink.len);
xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
break;
case NF4BLK: case NF4CHR:
- RESERVE_SPACE(8);
- WRITE32(create->u.device.specdata1);
- WRITE32(create->u.device.specdata2);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(create->u.device.specdata1);
+ *p = cpu_to_be32(create->u.device.specdata2);
break;
default:
break;
}
- RESERVE_SPACE(4 + create->name->len);
- WRITE32(create->name->len);
- WRITEMEM(create->name->name, create->name->len);
+ encode_string(xdr, create->name->len, create->name->name);
hdr->nops++;
hdr->replen += decode_create_maxsz;
@@ -969,10 +947,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
{
__be32 *p;
- RESERVE_SPACE(12);
- WRITE32(OP_GETATTR);
- WRITE32(1);
- WRITE32(bitmap);
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(OP_GETATTR);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(bitmap);
hdr->nops++;
hdr->replen += decode_getattr_maxsz;
}
@@ -981,11 +959,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
{
__be32 *p;
- RESERVE_SPACE(16);
- WRITE32(OP_GETATTR);
- WRITE32(2);
- WRITE32(bm0);
- WRITE32(bm1);
+ p = reserve_space(xdr, 16);
+ *p++ = cpu_to_be32(OP_GETATTR);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bm0);
+ *p = cpu_to_be32(bm1);
hdr->nops++;
hdr->replen += decode_getattr_maxsz;
}
@@ -1012,8 +990,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_GETFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_GETFH);
hdr->nops++;
hdr->replen += decode_getfh_maxsz;
}
@@ -1022,10 +1000,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
{
__be32 *p;
- RESERVE_SPACE(8 + name->len);
- WRITE32(OP_LINK);
- WRITE32(name->len);
- WRITEMEM(name->name, name->len);
+ p = reserve_space(xdr, 8 + name->len);
+ *p++ = cpu_to_be32(OP_LINK);
+ xdr_encode_opaque(p, name->name, name->len);
hdr->nops++;
hdr->replen += decode_link_maxsz;
}
@@ -1052,27 +1029,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
{
__be32 *p;
- RESERVE_SPACE(32);
- WRITE32(OP_LOCK);
- WRITE32(nfs4_lock_type(args->fl, args->block));
- WRITE32(args->reclaim);
- WRITE64(args->fl->fl_start);
- WRITE64(nfs4_lock_length(args->fl));
- WRITE32(args->new_lock_owner);
+ p = reserve_space(xdr, 32);
+ *p++ = cpu_to_be32(OP_LOCK);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
+ *p++ = cpu_to_be32(args->reclaim);
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ *p = cpu_to_be32(args->new_lock_owner);
if (args->new_lock_owner){
- RESERVE_SPACE(4+NFS4_STATEID_SIZE+32);
- WRITE32(args->open_seqid->sequence->counter);
- WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE);
- WRITE32(args->lock_seqid->sequence->counter);
- WRITE64(args->lock_owner.clientid);
- WRITE32(16);
- WRITEMEM("lock id:", 8);
- WRITE64(args->lock_owner.id);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
+ *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
+ p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
+ p = xdr_encode_hyper(p, args->lock_owner.clientid);
+ *p++ = cpu_to_be32(16);
+ p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+ xdr_encode_hyper(p, args->lock_owner.id);
}
else {
- RESERVE_SPACE(NFS4_STATEID_SIZE+4);
- WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
- WRITE32(args->lock_seqid->sequence->counter);
+ p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
+ p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(args->lock_seqid->sequence->counter);
}
hdr->nops++;
hdr->replen += decode_lock_maxsz;
@@ -1082,15 +1059,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
{
__be32 *p;
- RESERVE_SPACE(52);
- WRITE32(OP_LOCKT);
- WRITE32(nfs4_lock_type(args->fl, 0));
- WRITE64(args->fl->fl_start);
- WRITE64(nfs4_lock_length(args->fl));
- WRITE64(args->lock_owner.clientid);
- WRITE32(16);
- WRITEMEM("lock id:", 8);
- WRITE64(args->lock_owner.id);
+ p = reserve_space(xdr, 52);
+ *p++ = cpu_to_be32(OP_LOCKT);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ p = xdr_encode_hyper(p, args->lock_owner.clientid);
+ *p++ = cpu_to_be32(16);
+ p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+ xdr_encode_hyper(p, args->lock_owner.id);
hdr->nops++;
hdr->replen += decode_lockt_maxsz;
}
@@ -1099,13 +1076,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
{
__be32 *p;
- RESERVE_SPACE(12+NFS4_STATEID_SIZE+16);
- WRITE32(OP_LOCKU);
- WRITE32(nfs4_lock_type(args->fl, 0));
- WRITE32(args->seqid->sequence->counter);
- WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
- WRITE64(args->fl->fl_start);
- WRITE64(nfs4_lock_length(args->fl));
+ p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
+ *p++ = cpu_to_be32(OP_LOCKU);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+ *p++ = cpu_to_be32(args->seqid->sequence->counter);
+ p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ xdr_encode_hyper(p, nfs4_lock_length(args->fl));
hdr->nops++;
hdr->replen += decode_locku_maxsz;
}
@@ -1115,10 +1092,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
int len = name->len;
__be32 *p;
- RESERVE_SPACE(8 + len);
- WRITE32(OP_LOOKUP);
- WRITE32(len);
- WRITEMEM(name->name, len);
+ p = reserve_space(xdr, 8 + len);
+ *p++ = cpu_to_be32(OP_LOOKUP);
+ xdr_encode_opaque(p, name->name, len);
hdr->nops++;
hdr->replen += decode_lookup_maxsz;
}
@@ -1127,21 +1103,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
{
__be32 *p;
- RESERVE_SPACE(8);
+ p = reserve_space(xdr, 8);
switch (fmode & (FMODE_READ|FMODE_WRITE)) {
case FMODE_READ:
- WRITE32(NFS4_SHARE_ACCESS_READ);
+ *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
break;
case FMODE_WRITE:
- WRITE32(NFS4_SHARE_ACCESS_WRITE);
+ *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
break;
case FMODE_READ|FMODE_WRITE:
- WRITE32(NFS4_SHARE_ACCESS_BOTH);
+ *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
break;
default:
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
- WRITE32(0); /* for linux, share_deny = 0 always */
+ *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
}
static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1151,29 +1127,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
* opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
* owner 4 = 32
*/
- RESERVE_SPACE(8);
- WRITE32(OP_OPEN);
- WRITE32(arg->seqid->sequence->counter);
+ p = reserve_space(xdr, 8);
+ *p++ = cpu_to_be32(OP_OPEN);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
encode_share_access(xdr, arg->fmode);
- RESERVE_SPACE(28);
- WRITE64(arg->clientid);
- WRITE32(16);
- WRITEMEM("open id:", 8);
- WRITE64(arg->id);
+ p = reserve_space(xdr, 28);
+ p = xdr_encode_hyper(p, arg->clientid);
+ *p++ = cpu_to_be32(16);
+ p = xdr_encode_opaque_fixed(p, "open id:", 8);
+ xdr_encode_hyper(p, arg->id);
}
static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
__be32 *p;
- RESERVE_SPACE(4);
+ p = reserve_space(xdr, 4);
switch(arg->open_flags & O_EXCL) {
case 0:
- WRITE32(NFS4_CREATE_UNCHECKED);
+ *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
encode_attrs(xdr, arg->u.attrs, arg->server);
break;
default:
- WRITE32(NFS4_CREATE_EXCLUSIVE);
+ *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
encode_nfs4_verifier(xdr, &arg->u.verifier);
}
}
@@ -1182,14 +1158,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
{
__be32 *p;
- RESERVE_SPACE(4);
+ p = reserve_space(xdr, 4);
switch (arg->open_flags & O_CREAT) {
case 0:
- WRITE32(NFS4_OPEN_NOCREATE);
+ *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
break;
default:
BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
- WRITE32(NFS4_OPEN_CREATE);
+ *p = cpu_to_be32(NFS4_OPEN_CREATE);
encode_createmode(xdr, arg);
}
}
@@ -1198,16 +1174,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
{
__be32 *p;
- RESERVE_SPACE(4);
+ p = reserve_space(xdr, 4);
switch (delegation_type) {
case 0:
- WRITE32(NFS4_OPEN_DELEGATE_NONE);
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
break;
case FMODE_READ:
- WRITE32(NFS4_OPEN_DELEGATE_READ);
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
break;
case FMODE_WRITE|FMODE_READ:
- WRITE32(NFS4_OPEN_DELEGATE_WRITE);
+ *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
break;
default:
BUG();
@@ -1218,8 +1194,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(NFS4_OPEN_CLAIM_NULL);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
encode_string(xdr, name->len, name->name);
}
@@ -1227,8 +1203,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
encode_delegation_type(xdr, type);
}
@@ -1236,9 +1212,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
- WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
- WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+ xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
encode_string(xdr, name->len, name->name);
}
@@ -1267,10 +1243,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
- WRITE32(OP_OPEN_CONFIRM);
- WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
- WRITE32(arg->seqid->sequence->counter);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
+ *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
+ p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
hdr->nops++;
hdr->replen += decode_open_confirm_maxsz;
}
@@ -1279,10 +1255,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE+4);
- WRITE32(OP_OPEN_DOWNGRADE);
- WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
- WRITE32(arg->seqid->sequence->counter);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
+ *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
+ p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
encode_share_access(xdr, arg->fmode);
hdr->nops++;
hdr->replen += decode_open_downgrade_maxsz;
@@ -1294,10 +1270,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
int len = fh->size;
__be32 *p;
- RESERVE_SPACE(8 + len);
- WRITE32(OP_PUTFH);
- WRITE32(len);
- WRITEMEM(fh->data, len);
+ p = reserve_space(xdr, 8 + len);
+ *p++ = cpu_to_be32(OP_PUTFH);
+ xdr_encode_opaque(p, fh->data, len);
hdr->nops++;
hdr->replen += decode_putfh_maxsz;
}
@@ -1306,8 +1281,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_PUTROOTFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_PUTROOTFH);
hdr->nops++;
hdr->replen += decode_putrootfh_maxsz;
}
@@ -1317,26 +1292,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
nfs4_stateid stateid;
__be32 *p;
- RESERVE_SPACE(NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, NFS4_STATEID_SIZE);
if (ctx->state != NULL) {
nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
- WRITEMEM(stateid.data, NFS4_STATEID_SIZE);
+ xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
} else
- WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
+ xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
}
static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_READ);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_READ);
encode_stateid(xdr, args->context);
- RESERVE_SPACE(12);
- WRITE64(args->offset);
- WRITE32(args->count);
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->offset);
+ *p = cpu_to_be32(args->count);
hdr->nops++;
hdr->replen += decode_read_maxsz;
}
@@ -1349,20 +1324,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
};
__be32 *p;
- RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20);
- WRITE32(OP_READDIR);
- WRITE64(readdir->cookie);
- WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE);
- WRITE32(readdir->count >> 1); /* We're not doing readdirplus */
- WRITE32(readdir->count);
- WRITE32(2);
+ p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
+ *p++ = cpu_to_be32(OP_READDIR);
+ p = xdr_encode_hyper(p, readdir->cookie);
+ p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
+ *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */
+ *p++ = cpu_to_be32(readdir->count);
+ *p++ = cpu_to_be32(2);
/* Switch to mounted_on_fileid if the server supports it */
if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
attrs[0] &= ~FATTR4_WORD0_FILEID;
else
attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
- WRITE32(attrs[0] & readdir->bitmask[0]);
- WRITE32(attrs[1] & readdir->bitmask[1]);
+ *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
+ *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
hdr->nops++;
hdr->replen += decode_readdir_maxsz;
dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1378,8 +1353,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_READLINK);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_READLINK);
hdr->nops++;
hdr->replen += decode_readlink_maxsz;
}
@@ -1388,10 +1363,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
{
__be32 *p;
- RESERVE_SPACE(8 + name->len);
- WRITE32(OP_REMOVE);
- WRITE32(name->len);
- WRITEMEM(name->name, name->len);
+ p = reserve_space(xdr, 8 + name->len);
+ *p++ = cpu_to_be32(OP_REMOVE);
+ xdr_encode_opaque(p, name->name, name->len);
hdr->nops++;
hdr->replen += decode_remove_maxsz;
}
@@ -1400,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
{
__be32 *p;
- RESERVE_SPACE(8 + oldname->len);
- WRITE32(OP_RENAME);
- WRITE32(oldname->len);
- WRITEMEM(oldname->name, oldname->len);
-
- RESERVE_SPACE(4 + newname->len);
- WRITE32(newname->len);
- WRITEMEM(newname->name, newname->len);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_RENAME);
+ encode_string(xdr, oldname->len, oldname->name);
+ encode_string(xdr, newname->len, newname->name);
hdr->nops++;
hdr->replen += decode_rename_maxsz;
}
@@ -1416,9 +1386,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
{
__be32 *p;
- RESERVE_SPACE(12);
- WRITE32(OP_RENEW);
- WRITE64(client_stateid->cl_clientid);
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(OP_RENEW);
+ xdr_encode_hyper(p, client_stateid->cl_clientid);
hdr->nops++;
hdr->replen += decode_renew_maxsz;
}
@@ -1428,8 +1398,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_RESTOREFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_RESTOREFH);
hdr->nops++;
hdr->replen += decode_restorefh_maxsz;
}
@@ -1439,16 +1409,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
- WRITE32(OP_SETATTR);
- WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
- RESERVE_SPACE(2*4);
- WRITE32(1);
- WRITE32(FATTR4_WORD0_ACL);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_SETATTR);
+ xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 2*4);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(FATTR4_WORD0_ACL);
if (arg->acl_len % 4)
return -EINVAL;
- RESERVE_SPACE(4);
- WRITE32(arg->acl_len);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(arg->acl_len);
xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
hdr->nops++;
hdr->replen += decode_setacl_maxsz;
@@ -1460,8 +1430,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_SAVEFH);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_SAVEFH);
hdr->nops++;
hdr->replen += decode_savefh_maxsz;
}
@@ -1470,9 +1440,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
- WRITE32(OP_SETATTR);
- WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_SETATTR);
+ xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
hdr->nops++;
hdr->replen += decode_setattr_maxsz;
encode_attrs(xdr, arg->iap, server);
@@ -1482,17 +1452,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
{
__be32 *p;
- RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE);
- WRITE32(OP_SETCLIENTID);
- WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
+ p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
+ *p++ = cpu_to_be32(OP_SETCLIENTID);
+ xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
- RESERVE_SPACE(4);
- WRITE32(setclientid->sc_prog);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(setclientid->sc_prog);
encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
- RESERVE_SPACE(4);
- WRITE32(setclientid->sc_cb_ident);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(setclientid->sc_cb_ident);
hdr->nops++;
hdr->replen += decode_setclientid_maxsz;
}
@@ -1501,10 +1471,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
{
__be32 *p;
- RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
- WRITE32(OP_SETCLIENTID_CONFIRM);
- WRITE64(client_state->cl_clientid);
- WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+ p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
+ *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
+ p = xdr_encode_hyper(p, client_state->cl_clientid);
+ xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
hdr->nops++;
hdr->replen += decode_setclientid_confirm_maxsz;
}
@@ -1513,15 +1483,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
{
__be32 *p;
- RESERVE_SPACE(4);
- WRITE32(OP_WRITE);
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_WRITE);
encode_stateid(xdr, args->context);
- RESERVE_SPACE(16);
- WRITE64(args->offset);
- WRITE32(args->stable);
- WRITE32(args->count);
+ p = reserve_space(xdr, 16);
+ p = xdr_encode_hyper(p, args->offset);
+ *p++ = cpu_to_be32(args->stable);
+ *p = cpu_to_be32(args->count);
xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
hdr->nops++;
@@ -1532,10 +1502,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
{
__be32 *p;
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
- WRITE32(OP_DELEGRETURN);
- WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_DELEGRETURN);
+ xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
hdr->nops++;
hdr->replen += decode_delegreturn_maxsz;
}
@@ -1548,16 +1518,16 @@ static void encode_exchange_id(struct xdr_stream *xdr,
{
__be32 *p;
- RESERVE_SPACE(4 + sizeof(args->verifier->data));
- WRITE32(OP_EXCHANGE_ID);
- WRITEMEM(args->verifier->data, sizeof(args->verifier->data));
+ p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
+ *p++ = cpu_to_be32(OP_EXCHANGE_ID);
+ xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
encode_string(xdr, args->id_len, args->id);
- RESERVE_SPACE(12);
- WRITE32(args->flags);
- WRITE32(0); /* zero length state_protect4_a */
- WRITE32(0); /* zero length implementation id array */
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(args->flags);
+ *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
+ *p = cpu_to_be32(0); /* zero length implementation id array */
hdr->nops++;
hdr->replen += decode_exchange_id_maxsz;
}
@@ -1571,55 +1541,43 @@ static void encode_create_session(struct xdr_stream *xdr,
uint32_t len;
struct nfs_client *clp = args->client;
- RESERVE_SPACE(4);
- WRITE32(OP_CREATE_SESSION);
-
- RESERVE_SPACE(8);
- WRITE64(clp->cl_ex_clid);
+ len = scnprintf(machine_name, sizeof(machine_name), "%s",
+ clp->cl_ipaddr);
- RESERVE_SPACE(8);
- WRITE32(clp->cl_seqid); /*Sequence id */
- WRITE32(args->flags); /*flags */
+ p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
+ *p++ = cpu_to_be32(OP_CREATE_SESSION);
+ p = xdr_encode_hyper(p, clp->cl_ex_clid);
+ *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
+ *p++ = cpu_to_be32(args->flags); /*flags */
- RESERVE_SPACE(2*28); /* 2 channel_attrs */
/* Fore Channel */
- WRITE32(args->fc_attrs.headerpadsz); /* header padding size */
- WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */
- WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */
- WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
- WRITE32(args->fc_attrs.max_ops); /* max operations */
- WRITE32(args->fc_attrs.max_reqs); /* max requests */
- WRITE32(0); /* rdmachannel_attrs */
+ *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
/* Back Channel */
- WRITE32(args->fc_attrs.headerpadsz); /* header padding size */
- WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */
- WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */
- WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
- WRITE32(args->bc_attrs.max_ops); /* max operations */
- WRITE32(args->bc_attrs.max_reqs); /* max requests */
- WRITE32(0); /* rdmachannel_attrs */
-
- RESERVE_SPACE(4);
- WRITE32(args->cb_program); /* cb_program */
-
- RESERVE_SPACE(4); /* # of security flavors */
- WRITE32(1);
-
- RESERVE_SPACE(4);
- WRITE32(RPC_AUTH_UNIX); /* auth_sys */
+ *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
+ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
+ *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */
+ *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */
+ *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
+
+ *p++ = cpu_to_be32(args->cb_program); /* cb_program */
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
/* authsys_parms rfc1831 */
- RESERVE_SPACE(4);
- WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
- len = scnprintf(machine_name, sizeof(machine_name), "%s",
- clp->cl_ipaddr);
- RESERVE_SPACE(16 + len);
- WRITE32(len);
- WRITEMEM(machine_name, len);
- WRITE32(0); /* UID */
- WRITE32(0); /* GID */
- WRITE32(0); /* No more gids */
+ *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
+ p = xdr_encode_opaque(p, machine_name, len);
+ *p++ = cpu_to_be32(0); /* UID */
+ *p++ = cpu_to_be32(0); /* GID */
+ *p = cpu_to_be32(0); /* No more gids */
hdr->nops++;
hdr->replen += decode_create_session_maxsz;
}
@@ -1629,9 +1587,9 @@ static void encode_destroy_session(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
__be32 *p;
- RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN);
- WRITE32(OP_DESTROY_SESSION);
- WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+ p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(OP_DESTROY_SESSION);
+ xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
hdr->nops++;
hdr->replen += decode_destroy_session_maxsz;
}
@@ -1655,8 +1613,8 @@ static void encode_sequence(struct xdr_stream *xdr,
WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
slot = tp->slots + args->sa_slotid;
- RESERVE_SPACE(4);
- WRITE32(OP_SEQUENCE);
+ p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
+ *p++ = cpu_to_be32(OP_SEQUENCE);
/*
* Sessionid + seqid + slotid + max slotid + cache_this
@@ -1670,12 +1628,11 @@ static void encode_sequence(struct xdr_stream *xdr,
((u32 *)session->sess_id.data)[3],
slot->seq_nr, args->sa_slotid,
tp->highest_used_slotid, args->sa_cache_this);
- RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16);
- WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(slot->seq_nr);
- WRITE32(args->sa_slotid);
- WRITE32(tp->highest_used_slotid);
- WRITE32(args->sa_cache_this);
+ p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(slot->seq_nr);
+ *p++ = cpu_to_be32(args->sa_slotid);
+ *p++ = cpu_to_be32(tp->highest_used_slotid);
+ *p = cpu_to_be32(args->sa_cache_this);
hdr->nops++;
hdr->replen += decode_sequence_maxsz;
#endif /* CONFIG_NFS_V4_1 */
@@ -2466,68 +2423,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
}
#endif /* CONFIG_NFS_V4_1 */
-/*
- * START OF "GENERIC" DECODE ROUTINES.
- * These may look a little ugly since they are imported from a "generic"
- * set of XDR encode/decode routines which are intended to be shared by
- * all of our NFSv4 implementations (OpenBSD, MacOS X...).
- *
- * If the pain of reading these is too great, it should be a straightforward
- * task to translate them into Linux-specific versions which are more
- * consistent with the style used in NFSv2/v3...
- */
-#define READ32(x) (x) = ntohl(*p++)
-#define READ64(x) do { \
- (x) = (u64)ntohl(*p++) << 32; \
- (x) |= ntohl(*p++); \
-} while (0)
-#define READTIME(x) do { \
- p++; \
- (x.tv_sec) = ntohl(*p++); \
- (x.tv_nsec) = ntohl(*p++); \
-} while (0)
-#define COPYMEM(x,nbytes) do { \
- memcpy((x), p, nbytes); \
- p += XDR_QUADLEN(nbytes); \
-} while (0)
-
-#define READ_BUF(nbytes) do { \
- p = xdr_inline_decode(xdr, nbytes); \
- if (unlikely(!p)) { \
- dprintk("nfs: %s: prematurely hit end of receive" \
- " buffer\n", __func__); \
- dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
- __func__, xdr->p, nbytes, xdr->end); \
- return -EIO; \
- } \
-} while (0)
+static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
+{
+ dprintk("nfs: %s: prematurely hit end of receive buffer. "
+ "Remaining buffer length is %tu words.\n",
+ func, xdr->end - xdr->p);
+}
static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
{
__be32 *p;
- READ_BUF(4);
- READ32(*len);
- READ_BUF(*len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, *len);
+ if (unlikely(!p))
+ goto out_overflow;
*string = (char *)p;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
- READ_BUF(8);
- READ32(hdr->status);
- READ32(hdr->taglen);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ hdr->status = be32_to_cpup(p++);
+ hdr->taglen = be32_to_cpup(p);
- READ_BUF(hdr->taglen + 4);
+ p = xdr_inline_decode(xdr, hdr->taglen + 4);
+ if (unlikely(!p))
+ goto out_overflow;
hdr->tag = (char *)p;
p += XDR_QUADLEN(hdr->taglen);
- READ32(hdr->nops);
+ hdr->nops = be32_to_cpup(p);
if (unlikely(hdr->nops < 1))
return nfs4_stat_to_errno(hdr->status);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2536,18 +2478,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
uint32_t opnum;
int32_t nfserr;
- READ_BUF(8);
- READ32(opnum);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ opnum = be32_to_cpup(p++);
if (opnum != expected) {
dprintk("nfs: Server returned operation"
" %d but we issued a request for %d\n",
opnum, expected);
return -EIO;
}
- READ32(nfserr);
+ nfserr = be32_to_cpup(p);
if (nfserr != NFS_OK)
return nfs4_stat_to_errno(nfserr);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
/* Dummy routine */
@@ -2557,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
unsigned int strlen;
char *str;
- READ_BUF(12);
- return decode_opaque_inline(xdr, &strlen, &str);
+ p = xdr_inline_decode(xdr, 12);
+ if (likely(p))
+ return decode_opaque_inline(xdr, &strlen, &str);
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2566,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
uint32_t bmlen;
__be32 *p;
- READ_BUF(4);
- READ32(bmlen);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ bmlen = be32_to_cpup(p);
bitmap[0] = bitmap[1] = 0;
- READ_BUF((bmlen << 2));
+ p = xdr_inline_decode(xdr, (bmlen << 2));
+ if (unlikely(!p))
+ goto out_overflow;
if (bmlen > 0) {
- READ32(bitmap[0]);
+ bitmap[0] = be32_to_cpup(p++);
if (bmlen > 1)
- READ32(bitmap[1]);
+ bitmap[1] = be32_to_cpup(p);
}
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
{
__be32 *p;
- READ_BUF(4);
- READ32(*attrlen);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *attrlen = be32_to_cpup(p);
*savep = xdr->p;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2609,8 +2571,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
- READ_BUF(4);
- READ32(*type);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *type = be32_to_cpup(p);
if (*type < NF4REG || *type > NF4NAMEDATTR) {
dprintk("%s: bad type %d\n", __func__, *type);
return -EIO;
@@ -2620,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
}
dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2631,14 +2598,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
- READ_BUF(8);
- READ64(*change);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
ret = NFS_ATTR_FATTR_CHANGE;
}
dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2650,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
- READ_BUF(8);
- READ64(*size);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
ret = NFS_ATTR_FATTR_SIZE;
}
dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
}
dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
}
dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2701,9 +2688,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
- READ_BUF(16);
- READ64(fsid->major);
- READ64(fsid->minor);
+ p = xdr_inline_decode(xdr, 16);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &fsid->major);
+ xdr_decode_hyper(p, &fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
ret = NFS_ATTR_FATTR_FSID;
}
@@ -2711,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2721,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
}
dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2737,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
- READ_BUF(4);
- READ32(*res);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *res = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
}
dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2754,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
- READ_BUF(8);
- READ64(*fileid);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2772,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
- READ_BUF(8);
- READ64(*fileid);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2790,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
}
dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2807,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
}
dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2824,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
}
dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2838,8 +2865,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
__be32 *p;
int status = 0;
- READ_BUF(4);
- READ32(n);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ n = be32_to_cpup(p);
if (n == 0)
goto root_path;
dprintk("path ");
@@ -2873,6 +2902,9 @@ out_eio:
dprintk(" status %d", status);
status = -EIO;
goto out;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2890,8 +2922,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
status = decode_pathname(xdr, &res->fs_path);
if (unlikely(status != 0))
goto out;
- READ_BUF(4);
- READ32(n);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ n = be32_to_cpup(p);
if (n <= 0)
goto out_eio;
res->nlocations = 0;
@@ -2899,8 +2933,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
u32 m;
struct nfs4_fs_location *loc = &res->locations[res->nlocations];
- READ_BUF(4);
- READ32(m);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ m = be32_to_cpup(p);
loc->nservers = 0;
dprintk("%s: servers ", __func__);
@@ -2939,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
out_eio:
status = -EIO;
goto out;
@@ -2953,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
}
dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2970,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
- READ_BUF(4);
- READ32(*maxlink);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *maxlink = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
}
dprintk("%s: maxlink=%u\n", __func__, *maxlink);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2987,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
- READ_BUF(4);
- READ32(*maxname);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *maxname = be32_to_cpup(p);
bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
}
dprintk("%s: maxname=%u\n", __func__, *maxname);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3005,8 +3058,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
uint64_t maxread;
- READ_BUF(8);
- READ64(maxread);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, &maxread);
if (maxread > 0x7FFFFFFF)
maxread = 0x7FFFFFFF;
*res = (uint32_t)maxread;
@@ -3014,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
}
dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3026,8 +3084,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
uint64_t maxwrite;
- READ_BUF(8);
- READ64(maxwrite);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, &maxwrite);
if (maxwrite > 0x7FFFFFFF)
maxwrite = 0x7FFFFFFF;
*res = (uint32_t)maxwrite;
@@ -3035,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
}
dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3047,14 +3110,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
- READ_BUF(4);
- READ32(tmp);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ tmp = be32_to_cpup(p);
*mode = tmp & ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
ret = NFS_ATTR_FATTR_MODE;
}
dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3066,16 +3134,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
- READ_BUF(4);
- READ32(*nlink);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ *nlink = be32_to_cpup(p);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
ret = NFS_ATTR_FATTR_NLINK;
}
dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
-static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
+static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs_client *clp, uint32_t *uid, int may_sleep)
{
uint32_t len;
__be32 *p;
@@ -3085,10 +3159,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
- if (len < XDR_MAX_NETOBJ) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ if (!may_sleep) {
+ /* do nothing */
+ } else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
ret = NFS_ATTR_FATTR_OWNER;
else
@@ -3101,9 +3181,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
}
dprintk("%s: uid=%d\n", __func__, (int)*uid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
-static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
+static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
+ struct nfs_client *clp, uint32_t *gid, int may_sleep)
{
uint32_t len;
__be32 *p;
@@ -3113,10 +3197,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
- if (len < XDR_MAX_NETOBJ) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ if (!may_sleep) {
+ /* do nothing */
+ } else if (len < XDR_MAX_NETOBJ) {
if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
ret = NFS_ATTR_FATTR_GROUP;
else
@@ -3129,6 +3219,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
}
dprintk("%s: gid=%d\n", __func__, (int)*gid);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3143,9 +3236,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
dev_t tmp;
- READ_BUF(8);
- READ32(major);
- READ32(minor);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ major = be32_to_cpup(p++);
+ minor = be32_to_cpup(p);
tmp = MKDEV(major, minor);
if (MAJOR(tmp) == major && MINOR(tmp) == minor)
*rdev = tmp;
@@ -3154,6 +3249,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
}
dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3165,12 +3263,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
}
dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3182,12 +3285,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
}
dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3199,12 +3307,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
- READ_BUF(8);
- READ64(*res);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
}
dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3216,14 +3329,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
- READ_BUF(8);
- READ64(*used);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
ret = NFS_ATTR_FATTR_SPACE_USED;
}
dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
return ret;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3232,12 +3350,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
uint64_t sec;
uint32_t nsec;
- READ_BUF(12);
- READ64(sec);
- READ32(nsec);
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &sec);
+ nsec = be32_to_cpup(p);
time->tv_sec = (time_t)sec;
time->tv_nsec = (long)nsec;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3315,11 +3438,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
{
__be32 *p;
- READ_BUF(20);
- READ32(cinfo->atomic);
- READ64(cinfo->before);
- READ64(cinfo->after);
+ p = xdr_inline_decode(xdr, 20);
+ if (unlikely(!p))
+ goto out_overflow;
+ cinfo->atomic = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &cinfo->before);
+ xdr_decode_hyper(p, &cinfo->after);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3331,40 +3459,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
status = decode_op_hdr(xdr, OP_ACCESS);
if (status)
return status;
- READ_BUF(8);
- READ32(supp);
- READ32(acc);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ supp = be32_to_cpup(p++);
+ acc = be32_to_cpup(p);
access->supported = supp;
access->access = acc;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
-static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
{
__be32 *p;
+
+ p = xdr_inline_decode(xdr, len);
+ if (likely(p)) {
+ memcpy(buf, p, len);
+ return 0;
+ }
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
+}
+
+static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
+{
int status;
status = decode_op_hdr(xdr, OP_CLOSE);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- return 0;
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
+}
+
+static int decode_verifier(struct xdr_stream *xdr, void *verifier)
+{
+ return decode_opaque_fixed(xdr, verifier, 8);
}
static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_COMMIT);
- if (status)
- return status;
- READ_BUF(8);
- COPYMEM(res->verf->verifier, 8);
- return 0;
+ if (!status)
+ status = decode_verifier(xdr, res->verf->verifier);
+ return status;
}
static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3378,10 +3528,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
return status;
if ((status = decode_change_info(xdr, cinfo)))
return status;
- READ_BUF(4);
- READ32(bmlen);
- READ_BUF(bmlen << 2);
- return 0;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ bmlen = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (likely(p))
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3466,7 +3622,8 @@ xdr_error:
return status;
}
-static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
+static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+ const struct nfs_server *server, int may_sleep)
{
__be32 *savep;
uint32_t attrlen,
@@ -3538,12 +3695,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+ status = decode_attr_owner(xdr, bitmap, server->nfs_client,
+ &fattr->uid, may_sleep);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
- status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+ status = decode_attr_group(xdr, bitmap, server->nfs_client,
+ &fattr->gid, may_sleep);
if (status < 0)
goto xdr_error;
fattr->valid |= status;
@@ -3633,14 +3792,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
if (status)
return status;
- READ_BUF(4);
- READ32(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
if (len > NFS4_FHSIZE)
return -EIO;
fh->size = len;
- READ_BUF(len);
- COPYMEM(fh->data, len);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
+ memcpy(fh->data, p, len);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3662,10 +3828,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
__be32 *p;
uint32_t namelen, type;
- READ_BUF(32);
- READ64(offset);
- READ64(length);
- READ32(type);
+ p = xdr_inline_decode(xdr, 32);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &offset);
+ p = xdr_decode_hyper(p, &length);
+ type = be32_to_cpup(p++);
if (fl != NULL) {
fl->fl_start = (loff_t)offset;
fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3676,23 +3844,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
fl->fl_type = F_RDLCK;
fl->fl_pid = 0;
}
- READ64(clientid);
- READ32(namelen);
- READ_BUF(namelen);
- return -NFS4ERR_DENIED;
+ p = xdr_decode_hyper(p, &clientid);
+ namelen = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, namelen);
+ if (likely(p))
+ return -NFS4ERR_DENIED;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_LOCK);
if (status == -EIO)
goto out;
if (status == 0) {
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+ status = decode_stateid(xdr, &res->stateid);
+ if (unlikely(status))
+ goto out;
} else if (status == -NFS4ERR_DENIED)
status = decode_lock_denied(xdr, NULL);
if (res->open_seqid != NULL)
@@ -3713,16 +3885,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_LOCKU);
if (status != -EIO)
nfs_increment_lock_seqid(status, res->seqid);
- if (status == 0) {
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- }
+ if (status == 0)
+ status = decode_stateid(xdr, &res->stateid);
return status;
}
@@ -3737,34 +3906,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
__be32 *p;
uint32_t limit_type, nblocks, blocksize;
- READ_BUF(12);
- READ32(limit_type);
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ goto out_overflow;
+ limit_type = be32_to_cpup(p++);
switch (limit_type) {
case 1:
- READ64(*maxsize);
+ xdr_decode_hyper(p, maxsize);
break;
case 2:
- READ32(nblocks);
- READ32(blocksize);
+ nblocks = be32_to_cpup(p++);
+ blocksize = be32_to_cpup(p);
*maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
{
__be32 *p;
uint32_t delegation_type;
+ int status;
- READ_BUF(4);
- READ32(delegation_type);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ delegation_type = be32_to_cpup(p);
if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
res->delegation_type = 0;
return 0;
}
- READ_BUF(NFS4_STATEID_SIZE+4);
- COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
- READ32(res->do_recall);
+ status = decode_stateid(xdr, &res->delegation);
+ if (unlikely(status))
+ return status;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->do_recall = be32_to_cpup(p);
switch (delegation_type) {
case NFS4_OPEN_DELEGATE_READ:
@@ -3776,6 +3957,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
return -EIO;
}
return decode_ace(xdr, NULL, res->server->nfs_client);
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3787,23 +3971,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
status = decode_op_hdr(xdr, OP_OPEN);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ if (unlikely(status))
return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
decode_change_info(xdr, &res->cinfo);
- READ_BUF(8);
- READ32(res->rflags);
- READ32(bmlen);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->rflags = be32_to_cpup(p++);
+ bmlen = be32_to_cpup(p);
if (bmlen > 10)
goto xdr_error;
- READ_BUF(bmlen << 2);
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (unlikely(!p))
+ goto out_overflow;
savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
for (i = 0; i < savewords; ++i)
- READ32(res->attrset[i]);
+ res->attrset[i] = be32_to_cpup(p++);
for (; i < NFS4_BITMAP_SIZE; i++)
res->attrset[i] = 0;
@@ -3811,36 +3999,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
xdr_error:
dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
return -EIO;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- return 0;
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
}
static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
{
- __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- return 0;
+ if (!status)
+ status = decode_stateid(xdr, &res->stateid);
+ return status;
}
static int decode_putfh(struct xdr_stream *xdr)
@@ -3863,9 +4048,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
status = decode_op_hdr(xdr, OP_READ);
if (status)
return status;
- READ_BUF(8);
- READ32(eof);
- READ32(count);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ eof = be32_to_cpup(p++);
+ count = be32_to_cpup(p);
hdrlen = (u8 *) p - (u8 *) iov->iov_base;
recvd = req->rq_rcv_buf.len - hdrlen;
if (count > recvd) {
@@ -3878,6 +4065,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
res->eof = eof;
res->count = count;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3892,17 +4082,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
int status;
status = decode_op_hdr(xdr, OP_READDIR);
- if (status)
+ if (!status)
+ status = decode_verifier(xdr, readdir->verifier.data);
+ if (unlikely(status))
return status;
- READ_BUF(8);
- COPYMEM(readdir->verifier.data, 8);
dprintk("%s: verifier = %08x:%08x\n",
__func__,
((u32 *)readdir->verifier.data)[0],
((u32 *)readdir->verifier.data)[1]);
- hdrlen = (char *) p - (char *) iov->iov_base;
+ hdrlen = (char *) xdr->p - (char *) iov->iov_base;
recvd = rcvbuf->len - hdrlen;
if (pglen > recvd)
pglen = recvd;
@@ -3990,8 +4180,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
return status;
/* Convert length of symlink */
- READ_BUF(4);
- READ32(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
if (len >= rcvbuf->page_len || len <= 0) {
dprintk("nfs: server returned giant symlink!\n");
return -ENAMETOOLONG;
@@ -4015,6 +4207,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
kaddr[len+rcvbuf->page_base] = '\0';
kunmap_atomic(kaddr, KM_USER0);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4112,10 +4307,16 @@ static int decode_setattr(struct xdr_stream *xdr)
status = decode_op_hdr(xdr, OP_SETATTR);
if (status)
return status;
- READ_BUF(4);
- READ32(bmlen);
- READ_BUF(bmlen << 2);
- return 0;
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ bmlen = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, bmlen << 2);
+ if (likely(p))
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4124,35 +4325,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
uint32_t opnum;
int32_t nfserr;
- READ_BUF(8);
- READ32(opnum);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ opnum = be32_to_cpup(p++);
if (opnum != OP_SETCLIENTID) {
dprintk("nfs: decode_setclientid: Server returned operation"
" %d\n", opnum);
return -EIO;
}
- READ32(nfserr);
+ nfserr = be32_to_cpup(p);
if (nfserr == NFS_OK) {
- READ_BUF(8 + NFS4_VERIFIER_SIZE);
- READ64(clp->cl_clientid);
- COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE);
+ p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
+ if (unlikely(!p))
+ goto out_overflow;
+ p = xdr_decode_hyper(p, &clp->cl_clientid);
+ memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
} else if (nfserr == NFSERR_CLID_INUSE) {
uint32_t len;
/* skip netid string */
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
/* skip uaddr string */
- READ_BUF(4);
- READ32(len);
- READ_BUF(len);
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ len = be32_to_cpup(p);
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ goto out_overflow;
return -NFSERR_CLID_INUSE;
} else
return nfs4_stat_to_errno(nfserr);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4169,11 +4385,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
if (status)
return status;
- READ_BUF(16);
- READ32(res->count);
- READ32(res->verf->committed);
- COPYMEM(res->verf->verifier, 8);
+ p = xdr_inline_decode(xdr, 16);
+ if (unlikely(!p))
+ goto out_overflow;
+ res->count = be32_to_cpup(p++);
+ res->verf->committed = be32_to_cpup(p++);
+ memcpy(res->verf->verifier, p, 8);
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4187,6 +4408,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
{
__be32 *p;
uint32_t dummy;
+ char *dummy_str;
int status;
struct nfs_client *clp = res->client;
@@ -4194,36 +4416,45 @@ static int decode_exchange_id(struct xdr_stream *xdr,
if (status)
return status;
- READ_BUF(8);
- READ64(clp->cl_ex_clid);
- READ_BUF(12);
- READ32(clp->cl_seqid);
- READ32(clp->cl_exchange_flags);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ xdr_decode_hyper(p, &clp->cl_ex_clid);
+ p = xdr_inline_decode(xdr, 12);
+ if (unlikely(!p))
+ goto out_overflow;
+ clp->cl_seqid = be32_to_cpup(p++);
+ clp->cl_exchange_flags = be32_to_cpup(p++);
/* We ask for SP4_NONE */
- READ32(dummy);
+ dummy = be32_to_cpup(p);
if (dummy != SP4_NONE)
return -EIO;
/* Throw away minor_id */
- READ_BUF(8);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
/* Throw away Major id */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
/* Throw away server_scope */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
/* Throw away Implementation id array */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
+ status = decode_opaque_inline(xdr, &dummy, &dummy_str);
+ if (unlikely(status))
+ return status;
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4232,22 +4463,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
__be32 *p;
u32 nr_attrs;
- READ_BUF(28);
- READ32(attrs->headerpadsz);
- READ32(attrs->max_rqst_sz);
- READ32(attrs->max_resp_sz);
- READ32(attrs->max_resp_sz_cached);
- READ32(attrs->max_ops);
- READ32(attrs->max_reqs);
- READ32(nr_attrs);
+ p = xdr_inline_decode(xdr, 28);
+ if (unlikely(!p))
+ goto out_overflow;
+ attrs->headerpadsz = be32_to_cpup(p++);
+ attrs->max_rqst_sz = be32_to_cpup(p++);
+ attrs->max_resp_sz = be32_to_cpup(p++);
+ attrs->max_resp_sz_cached = be32_to_cpup(p++);
+ attrs->max_ops = be32_to_cpup(p++);
+ attrs->max_reqs = be32_to_cpup(p++);
+ nr_attrs = be32_to_cpup(p);
if (unlikely(nr_attrs > 1)) {
printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
__func__, nr_attrs);
return -EINVAL;
}
- if (nr_attrs == 1)
- READ_BUF(4); /* skip rdma_attrs */
+ if (nr_attrs == 1) {
+ p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
+ if (unlikely(!p))
+ goto out_overflow;
+ }
return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
+{
+ return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
}
static int decode_create_session(struct xdr_stream *xdr,
@@ -4259,24 +4503,26 @@ static int decode_create_session(struct xdr_stream *xdr,
struct nfs4_session *session = clp->cl_session;
status = decode_op_hdr(xdr, OP_CREATE_SESSION);
-
- if (status)
+ if (!status)
+ status = decode_sessionid(xdr, &session->sess_id);
+ if (unlikely(status))
return status;
- /* sessionid */
- READ_BUF(NFS4_MAX_SESSIONID_LEN);
- COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
-
/* seqid, flags */
- READ_BUF(8);
- READ32(clp->cl_seqid);
- READ32(session->flags);
+ p = xdr_inline_decode(xdr, 8);
+ if (unlikely(!p))
+ goto out_overflow;
+ clp->cl_seqid = be32_to_cpup(p++);
+ session->flags = be32_to_cpup(p);
/* Channel attributes */
status = decode_chan_attrs(xdr, &session->fc_attrs);
if (!status)
status = decode_chan_attrs(xdr, &session->bc_attrs);
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
}
static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4300,7 +4546,9 @@ static int decode_sequence(struct xdr_stream *xdr,
return 0;
status = decode_op_hdr(xdr, OP_SEQUENCE);
- if (status)
+ if (!status)
+ status = decode_sessionid(xdr, &id);
+ if (unlikely(status))
goto out_err;
/*
@@ -4309,36 +4557,43 @@ static int decode_sequence(struct xdr_stream *xdr,
*/
status = -ESERVERFAULT;
- slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
- READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
- COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
if (memcmp(id.data, res->sr_session->sess_id.data,
NFS4_MAX_SESSIONID_LEN)) {
dprintk("%s Invalid session id\n", __func__);
goto out_err;
}
+
+ p = xdr_inline_decode(xdr, 20);
+ if (unlikely(!p))
+ goto out_overflow;
+
/* seqid */
- READ32(dummy);
+ slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
+ dummy = be32_to_cpup(p++);
if (dummy != slot->seq_nr) {
dprintk("%s Invalid sequence number\n", __func__);
goto out_err;
}
/* slot id */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
if (dummy != res->sr_slotid) {
dprintk("%s Invalid slot id\n", __func__);
goto out_err;
}
/* highest slot id - currently not processed */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* target highest slot id - currently not processed */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* result flags - currently not processed */
- READ32(dummy);
+ dummy = be32_to_cpup(p);
status = 0;
out_err:
res->sr_status = status;
return status;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ status = -EIO;
+ goto out_err;
#else /* CONFIG_NFS_V4_1 */
return 0;
#endif /* CONFIG_NFS_V4_1 */
@@ -4370,7 +4625,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
status = decode_open_downgrade(&xdr, res);
if (status != 0)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4397,7 +4653,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
status = decode_access(&xdr, res);
if (status != 0)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4424,7 +4681,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
goto out;
if ((status = decode_getfh(&xdr, res->fh)) != 0)
goto out;
- status = decode_getfattr(&xdr, res->fattr, res->server);
+ status = decode_getfattr(&xdr, res->fattr, res->server
+ ,!RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4448,7 +4706,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
if ((status = decode_putrootfh(&xdr)) != 0)
goto out;
if ((status = decode_getfh(&xdr, res->fh)) == 0)
- status = decode_getfattr(&xdr, res->fattr, res->server);
+ status = decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4473,7 +4732,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
goto out;
if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
goto out;
- decode_getfattr(&xdr, &res->dir_attr, res->server);
+ decode_getfattr(&xdr, &res->dir_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4503,11 +4763,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
goto out;
/* Current FH is target directory */
- if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->new_fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if ((status = decode_restorefh(&xdr)) != 0)
goto out;
- decode_getfattr(&xdr, res->old_fattr, res->server);
+ decode_getfattr(&xdr, res->old_fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4540,11 +4802,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
* Note order: OP_LINK leaves the directory as the current
* filehandle.
*/
- if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->dir_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if ((status = decode_restorefh(&xdr)) != 0)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4573,11 +4837,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
goto out;
if ((status = decode_getfh(&xdr, res->fh)) != 0)
goto out;
- if (decode_getfattr(&xdr, res->fattr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if ((status = decode_restorefh(&xdr)) != 0)
goto out;
- decode_getfattr(&xdr, res->dir_fattr, res->server);
+ decode_getfattr(&xdr, res->dir_fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4609,7 +4875,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
status = decode_putfh(&xdr);
if (status)
goto out;
- status = decode_getfattr(&xdr, res->fattr, res->server);
+ status = decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4716,7 +4983,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
* an ESTALE error. Shouldn't be a problem,
* though, since fattr->valid will remain unset.
*/
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4748,11 +5016,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
goto out;
if (decode_getfh(&xdr, &res->fh) != 0)
goto out;
- if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
+ if (decode_getfattr(&xdr, res->f_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
goto out;
if (decode_restorefh(&xdr) != 0)
goto out;
- decode_getfattr(&xdr, res->dir_attr, res->server);
+ decode_getfattr(&xdr, res->dir_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4800,7 +5070,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
status = decode_open(&xdr, res);
if (status)
goto out;
- decode_getfattr(&xdr, res->f_attr, res->server);
+ decode_getfattr(&xdr, res->f_attr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -4827,7 +5098,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
status = decode_setattr(&xdr);
if (status)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -5001,7 +5273,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
status = decode_write(&xdr, res);
if (status)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
if (!status)
status = res->count;
out:
@@ -5030,7 +5303,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
status = decode_commit(&xdr, res);
if (status)
goto out;
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -5194,7 +5468,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
if (status != 0)
goto out;
status = decode_delegreturn(&xdr);
- decode_getfattr(&xdr, res->fattr, res->server);
+ decode_getfattr(&xdr, res->fattr, res->server,
+ !RPC_IS_ASYNC(rqstp->rq_task));
out:
return status;
}
@@ -5222,7 +5497,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
goto out;
xdr_enter_page(&xdr, PAGE_SIZE);
status = decode_getfattr(&xdr, &res->fs_locations->fattr,
- res->fs_locations->server);
+ res->fs_locations->server,
+ !RPC_IS_ASYNC(req->rq_task));
out:
return status;
}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 73ea5e8d66ce..12c9e66d3f1d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
return p;
}
-static void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readdata_free(struct nfs_read_data *p)
{
if (p && (p->pagevec != &p->page_array[0]))
kfree(p->pagevec);
mempool_free(p, nfs_rdata_mempool);
}
-void nfs_readdata_release(void *data)
+static void nfs_readdata_release(struct nfs_read_data *rdata)
{
- struct nfs_read_data *rdata = data;
-
put_nfs_open_context(rdata->args.context);
nfs_readdata_free(rdata);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b4cbdc60abd..f1cc0587cfef 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,7 +73,7 @@ enum {
Opt_cto, Opt_nocto,
Opt_ac, Opt_noac,
Opt_lock, Opt_nolock,
- Opt_v2, Opt_v3,
+ Opt_v2, Opt_v3, Opt_v4,
Opt_udp, Opt_tcp, Opt_rdma,
Opt_acl, Opt_noacl,
Opt_rdirplus, Opt_nordirplus,
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_nolock, "nolock" },
{ Opt_v2, "v2" },
{ Opt_v3, "v3" },
+ { Opt_v4, "v4" },
{ Opt_udp, "udp" },
{ Opt_tcp, "tcp" },
{ Opt_rdma, "rdma" },
@@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_mountvers, "mountvers=%s" },
{ Opt_nfsvers, "nfsvers=%s" },
{ Opt_nfsvers, "vers=%s" },
- { Opt_minorversion, "minorversion=%u" },
+ { Opt_minorversion, "minorversion=%s" },
{ Opt_sec, "sec=%s" },
{ Opt_proto, "proto=%s" },
@@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = {
};
#ifdef CONFIG_NFS_V4
+static int nfs4_validate_text_mount_data(void *options,
+ struct nfs_parsed_mount_data *args, const char *dev_name);
+static int nfs4_try_mount(int flags, const char *dev_name,
+ struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
static int nfs4_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -742,127 +747,23 @@ static int nfs_verify_server_address(struct sockaddr *addr)
}
}
+ dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
return 0;
}
-static void nfs_parse_ipv4_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
-{
- struct sockaddr_in *sin = (struct sockaddr_in *)sap;
- u8 *addr = (u8 *)&sin->sin_addr.s_addr;
-
- if (str_len <= INET_ADDRSTRLEN) {
- dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
- (int)str_len, string);
-
- sin->sin_family = AF_INET;
- *addr_len = sizeof(*sin);
- if (in4_pton(string, str_len, addr, '\0', NULL))
- return;
- }
-
- sap->sa_family = AF_UNSPEC;
- *addr_len = 0;
-}
-
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
- const char *delim,
- struct sockaddr_in6 *sin6)
-{
- char *p;
- size_t len;
-
- if ((string + str_len) == delim)
- return 1;
-
- if (*delim != IPV6_SCOPE_DELIMITER)
- return 0;
-
- if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
- return 0;
-
- len = (string + str_len) - delim - 1;
- p = kstrndup(delim + 1, len, GFP_KERNEL);
- if (p) {
- unsigned long scope_id = 0;
- struct net_device *dev;
-
- dev = dev_get_by_name(&init_net, p);
- if (dev != NULL) {
- scope_id = dev->ifindex;
- dev_put(dev);
- } else {
- if (strict_strtoul(p, 10, &scope_id) == 0) {
- kfree(p);
- return 0;
- }
- }
-
- kfree(p);
-
- sin6->sin6_scope_id = scope_id;
- dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
- return 1;
- }
-
- return 0;
-}
-
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
-{
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
- u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
- const char *delim;
-
- if (str_len <= INET6_ADDRSTRLEN) {
- dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
- (int)str_len, string);
-
- sin6->sin6_family = AF_INET6;
- *addr_len = sizeof(*sin6);
- if (in6_pton(string, str_len, addr,
- IPV6_SCOPE_DELIMITER, &delim) != 0) {
- if (nfs_parse_ipv6_scope_id(string, str_len,
- delim, sin6) != 0)
- return;
- }
- }
-
- sap->sa_family = AF_UNSPEC;
- *addr_len = 0;
-}
-#else
-static void nfs_parse_ipv6_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
-{
- sap->sa_family = AF_UNSPEC;
- *addr_len = 0;
-}
-#endif
-
/*
- * Construct a sockaddr based on the contents of a string that contains
- * an IP address in presentation format.
- *
- * If there is a problem constructing the new sockaddr, set the address
- * family to AF_UNSPEC.
+ * Select between a default port value and a user-specified port value.
+ * If a zero value is set, then autobind will be used.
*/
-void nfs_parse_ip_address(char *string, size_t str_len,
- struct sockaddr *sap, size_t *addr_len)
+static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
+ const unsigned short default_port)
{
- unsigned int i, colons;
+ unsigned short port = default_port;
- colons = 0;
- for (i = 0; i < str_len; i++)
- if (string[i] == ':')
- colons++;
+ if (parsed_port != NFS_UNSPEC_PORT)
+ port = parsed_port;
- if (colons >= 2)
- nfs_parse_ipv6_address(string, str_len, sap, addr_len);
- else
- nfs_parse_ipv4_address(string, str_len, sap, addr_len);
+ rpc_set_port(sap, port);
}
/*
@@ -904,8 +805,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
/*
* Parse the value of the 'sec=' option.
- *
- * The flavor_len setting is for v4 mounts.
*/
static int nfs_parse_security_flavors(char *value,
struct nfs_parsed_mount_data *mnt)
@@ -916,53 +815,43 @@ static int nfs_parse_security_flavors(char *value,
switch (match_token(value, nfs_secflavor_tokens, args)) {
case Opt_sec_none:
- mnt->auth_flavor_len = 0;
mnt->auth_flavors[0] = RPC_AUTH_NULL;
break;
case Opt_sec_sys:
- mnt->auth_flavor_len = 0;
mnt->auth_flavors[0] = RPC_AUTH_UNIX;
break;
case Opt_sec_krb5:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
break;
case Opt_sec_krb5i:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
break;
case Opt_sec_krb5p:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
break;
case Opt_sec_lkey:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
break;
case Opt_sec_lkeyi:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
break;
case Opt_sec_lkeyp:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
break;
case Opt_sec_spkm:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
break;
case Opt_sec_spkmi:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
break;
case Opt_sec_spkmp:
- mnt->auth_flavor_len = 1;
mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
break;
default:
return 0;
}
+ mnt->auth_flavor_len = 1;
return 1;
}
@@ -1001,7 +890,6 @@ static int nfs_parse_mount_options(char *raw,
while ((p = strsep(&raw, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
unsigned long option;
- int int_option;
int token;
if (!*p)
@@ -1047,10 +935,18 @@ static int nfs_parse_mount_options(char *raw,
break;
case Opt_v2:
mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 2;
break;
case Opt_v3:
mnt->flags |= NFS_MOUNT_VER3;
+ mnt->version = 3;
break;
+#ifdef CONFIG_NFS_V4
+ case Opt_v4:
+ mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 4;
+ break;
+#endif
case Opt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1264,20 +1160,33 @@ static int nfs_parse_mount_options(char *raw,
switch (option) {
case NFS2_VERSION:
mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 2;
break;
case NFS3_VERSION:
mnt->flags |= NFS_MOUNT_VER3;
+ mnt->version = 3;
break;
+#ifdef CONFIG_NFS_V4
+ case NFS4_VERSION:
+ mnt->flags &= ~NFS_MOUNT_VER3;
+ mnt->version = 4;
+ break;
+#endif
default:
goto out_invalid_value;
}
break;
case Opt_minorversion:
- if (match_int(args, &int_option))
- return 0;
- if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION)
- return 0;
- mnt->minorversion = int_option;
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+ rc = strict_strtoul(string, 10, &option);
+ kfree(string);
+ if (rc != 0)
+ goto out_invalid_value;
+ if (option > NFS4_MAX_MINOR_VERSION)
+ goto out_invalid_value;
+ mnt->minorversion = option;
break;
/*
@@ -1352,11 +1261,14 @@ static int nfs_parse_mount_options(char *raw,
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_ip_address(string, strlen(string),
- (struct sockaddr *)
- &mnt->nfs_server.address,
- &mnt->nfs_server.addrlen);
+ mnt->nfs_server.addrlen =
+ rpc_pton(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->nfs_server.address,
+ sizeof(mnt->nfs_server.address));
kfree(string);
+ if (mnt->nfs_server.addrlen == 0)
+ goto out_invalid_address;
break;
case Opt_clientaddr:
string = match_strdup(args);
@@ -1376,11 +1288,14 @@ static int nfs_parse_mount_options(char *raw,
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_ip_address(string, strlen(string),
- (struct sockaddr *)
- &mnt->mount_server.address,
- &mnt->mount_server.addrlen);
+ mnt->mount_server.addrlen =
+ rpc_pton(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->mount_server.address,
+ sizeof(mnt->mount_server.address));
kfree(string);
+ if (mnt->mount_server.addrlen == 0)
+ goto out_invalid_address;
break;
case Opt_lookupcache:
string = match_strdup(args);
@@ -1432,8 +1347,11 @@ static int nfs_parse_mount_options(char *raw,
return 1;
+out_invalid_address:
+ printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
+ return 0;
out_invalid_value:
- printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p);
+ printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
return 0;
out_nomem:
printk(KERN_INFO "NFS: not enough memory to parse option\n");
@@ -1445,13 +1363,60 @@ out_security_failure:
}
/*
+ * Match the requested auth flavors with the list returned by
+ * the server. Returns zero and sets the mount's authentication
+ * flavor on success; returns -EACCES if server does not support
+ * the requested flavor.
+ */
+static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
+ struct nfs_mount_request *request)
+{
+ unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
+
+ /*
+ * Certain releases of Linux's mountd return an empty
+ * flavor list. To prevent behavioral regression with
+ * these servers (ie. rejecting mounts that used to
+ * succeed), revert to pre-2.6.32 behavior (no checking)
+ * if the returned flavor list is empty.
+ */
+ if (server_authlist_len == 0)
+ return 0;
+
+ /*
+ * We avoid sophisticated negotiating here, as there are
+ * plenty of cases where we can get it wrong, providing
+ * either too little or too much security.
+ *
+ * RFC 2623, section 2.7 suggests we SHOULD prefer the
+ * flavor listed first. However, some servers list
+ * AUTH_NULL first. Our caller plants AUTH_SYS, the
+ * preferred default, in args->auth_flavors[0] if user
+ * didn't specify sec= mount option.
+ */
+ for (i = 0; i < args->auth_flavor_len; i++)
+ for (j = 0; j < server_authlist_len; j++)
+ if (args->auth_flavors[i] == request->auth_flavs[j]) {
+ dfprintk(MOUNT, "NFS: using auth flavor %d\n",
+ request->auth_flavs[j]);
+ args->auth_flavors[0] = request->auth_flavs[j];
+ return 0;
+ }
+
+ dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
+ nfs_umount(request);
+ return -EACCES;
+}
+
+/*
* Use the remote server's MOUNT service to request the NFS file handle
* corresponding to the provided path.
*/
static int nfs_try_mount(struct nfs_parsed_mount_data *args,
struct nfs_fh *root_fh)
{
- unsigned int auth_flavor_len = 0;
+ rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
+ unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
struct nfs_mount_request request = {
.sap = (struct sockaddr *)
&args->mount_server.address,
@@ -1459,7 +1424,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
.protocol = args->mount_server.protocol,
.fh = root_fh,
.noresvport = args->flags & NFS_MOUNT_NORESVPORT,
- .auth_flav_len = &auth_flavor_len,
+ .auth_flav_len = &server_authlist_len,
+ .auth_flavs = server_authlist,
};
int status;
@@ -1485,23 +1451,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
args->mount_server.addrlen = args->nfs_server.addrlen;
}
request.salen = args->mount_server.addrlen;
-
- /*
- * autobind will be used if mount_server.port == 0
- */
- nfs_set_port(request.sap, args->mount_server.port);
+ nfs_set_default_port(request.sap, args->mount_server.port, 0);
/*
* Now ask the mount server to map our export path
* to a file handle.
*/
status = nfs_mount(&request);
- if (status == 0)
- return 0;
+ if (status != 0) {
+ dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
+ request.hostname, status);
+ return status;
+ }
- dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
- request.hostname, status);
- return status;
+ /*
+ * MNTv1 (NFSv2) does not support auth flavor negotiation.
+ */
+ if (args->mount_server.version != NFS_MNT3_VERSION)
+ return 0;
+ return nfs_walk_authlist(args, &request);
}
static int nfs_parse_simple_hostname(const char *dev_name,
@@ -1661,6 +1629,7 @@ static int nfs_validate_mount_data(void *options,
const char *dev_name)
{
struct nfs_mount_data *data = (struct nfs_mount_data *)options;
+ struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
if (data == NULL)
goto out_no_data;
@@ -1672,10 +1641,12 @@ static int nfs_validate_mount_data(void *options,
args->acregmax = NFS_DEF_ACREGMAX;
args->acdirmin = NFS_DEF_ACDIRMIN;
args->acdirmax = NFS_DEF_ACDIRMAX;
- args->mount_server.port = 0; /* autobind unless user sets port */
- args->nfs_server.port = 0; /* autobind unless user sets port */
+ args->mount_server.port = NFS_UNSPEC_PORT;
+ args->nfs_server.port = NFS_UNSPEC_PORT;
args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
args->auth_flavors[0] = RPC_AUTH_UNIX;
+ args->auth_flavor_len = 1;
+ args->minorversion = 0;
switch (data->version) {
case 1:
@@ -1697,8 +1668,11 @@ static int nfs_validate_mount_data(void *options,
if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
goto out_invalid_fh;
mntfh->size = data->root.size;
- } else
+ args->version = 3;
+ } else {
mntfh->size = NFS2_FHSIZE;
+ args->version = 2;
+ }
memcpy(mntfh->data, data->root.data, mntfh->size);
@@ -1720,11 +1694,9 @@ static int nfs_validate_mount_data(void *options,
args->acdirmin = data->acdirmin;
args->acdirmax = data->acdirmax;
- memcpy(&args->nfs_server.address, &data->addr,
- sizeof(data->addr));
+ memcpy(sap, &data->addr, sizeof(data->addr));
args->nfs_server.addrlen = sizeof(data->addr);
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
goto out_no_address;
if (!(data->flags & NFS_MOUNT_TCP))
@@ -1772,12 +1744,18 @@ static int nfs_validate_mount_data(void *options,
if (nfs_parse_mount_options((char *)options, args) == 0)
return -EINVAL;
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
goto out_no_address;
- nfs_set_port((struct sockaddr *)&args->nfs_server.address,
- args->nfs_server.port);
+ if (args->version == 4)
+#ifdef CONFIG_NFS_V4
+ return nfs4_validate_text_mount_data(options,
+ args, dev_name);
+#else
+ goto out_v4_not_compiled;
+#endif
+
+ nfs_set_default_port(sap, args->nfs_server.port, 0);
nfs_set_mount_transport_protocol(args);
@@ -1825,6 +1803,12 @@ out_v3_not_compiled:
return -EPROTONOSUPPORT;
#endif /* !CONFIG_NFS_V3 */
+#ifndef CONFIG_NFS_V4
+out_v4_not_compiled:
+ dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
+ return -EPROTONOSUPPORT;
+#endif /* !CONFIG_NFS_V4 */
+
out_nomem:
dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
return -ENOMEM;
@@ -1934,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
if (server->flags & NFS_MOUNT_NOAC)
sb->s_flags |= MS_SYNCHRONOUS;
+ sb->s_bdi = &server->backing_dev_info;
+
nfs_super_set_maxbytes(sb, server->maxfilesize);
}
@@ -2120,6 +2106,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (error < 0)
goto out;
+#ifdef CONFIG_NFS_V4
+ if (data->version == 4) {
+ error = nfs4_try_mount(flags, dev_name, data, mnt);
+ kfree(data->client_address);
+ goto out;
+ }
+#endif /* CONFIG_NFS_V4 */
+
/* Get a volume representation */
server = nfs_create_server(data, mntfh);
if (IS_ERR(server)) {
@@ -2196,8 +2190,8 @@ static void nfs_kill_super(struct super_block *s)
{
struct nfs_server *server = NFS_SB(s);
- bdi_unregister(&server->backing_dev_info);
kill_anon_super(s);
+ bdi_unregister(&server->backing_dev_info);
nfs_fscache_release_super_cookie(s);
nfs_free_server(server);
}
@@ -2317,6 +2311,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
}
+static int nfs4_validate_text_mount_data(void *options,
+ struct nfs_parsed_mount_data *args,
+ const char *dev_name)
+{
+ struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
+
+ nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
+
+ nfs_validate_transport_protocol(args);
+
+ nfs4_validate_mount_flags(args);
+
+ if (args->version != 4) {
+ dfprintk(MOUNT,
+ "NFS4: Illegal mount version\n");
+ return -EINVAL;
+ }
+
+ if (args->auth_flavor_len > 1) {
+ dfprintk(MOUNT,
+ "NFS4: Too many RPC auth flavours specified\n");
+ return -EINVAL;
+ }
+
+ if (args->client_address == NULL) {
+ dfprintk(MOUNT,
+ "NFS4: mount program didn't pass callback address\n");
+ return -EINVAL;
+ }
+
+ return nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ NFS4_MAXNAMLEN,
+ &args->nfs_server.export_path,
+ NFS4_MAXPATHLEN);
+}
+
/*
* Validate NFSv4 mount options
*/
@@ -2324,7 +2355,7 @@ static int nfs4_validate_mount_data(void *options,
struct nfs_parsed_mount_data *args,
const char *dev_name)
{
- struct sockaddr_in *ap;
+ struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
char *c;
@@ -2337,23 +2368,22 @@ static int nfs4_validate_mount_data(void *options,
args->acregmax = NFS_DEF_ACREGMAX;
args->acdirmin = NFS_DEF_ACDIRMIN;
args->acdirmax = NFS_DEF_ACDIRMAX;
- args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
+ args->nfs_server.port = NFS_UNSPEC_PORT;
args->auth_flavors[0] = RPC_AUTH_UNIX;
- args->auth_flavor_len = 0;
+ args->auth_flavor_len = 1;
+ args->version = 4;
args->minorversion = 0;
switch (data->version) {
case 1:
- ap = (struct sockaddr_in *)&args->nfs_server.address;
if (data->host_addrlen > sizeof(args->nfs_server.address))
goto out_no_address;
if (data->host_addrlen == 0)
goto out_no_address;
args->nfs_server.addrlen = data->host_addrlen;
- if (copy_from_user(ap, data->host_addr, data->host_addrlen))
+ if (copy_from_user(sap, data->host_addr, data->host_addrlen))
return -EFAULT;
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
goto out_no_address;
if (data->auth_flavourlen) {
@@ -2399,39 +2429,14 @@ static int nfs4_validate_mount_data(void *options,
nfs_validate_transport_protocol(args);
break;
- default: {
- int status;
-
+ default:
if (nfs_parse_mount_options((char *)options, args) == 0)
return -EINVAL;
- if (!nfs_verify_server_address((struct sockaddr *)
- &args->nfs_server.address))
+ if (!nfs_verify_server_address(sap))
return -EINVAL;
- nfs_set_port((struct sockaddr *)&args->nfs_server.address,
- args->nfs_server.port);
-
- nfs_validate_transport_protocol(args);
-
- nfs4_validate_mount_flags(args);
-
- if (args->auth_flavor_len > 1)
- goto out_inval_auth;
-
- if (args->client_address == NULL)
- goto out_no_client_address;
-
- status = nfs_parse_devname(dev_name,
- &args->nfs_server.hostname,
- NFS4_MAXNAMLEN,
- &args->nfs_server.export_path,
- NFS4_MAXPATHLEN);
- if (status < 0)
- return status;
-
- break;
- }
+ return nfs4_validate_text_mount_data(options, args, dev_name);
}
return 0;
@@ -2448,10 +2453,6 @@ out_inval_auth:
out_no_address:
dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
return -EINVAL;
-
-out_no_client_address:
- dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n");
- return -EINVAL;
}
/*
@@ -2618,6 +2619,34 @@ out_err:
return ret;
}
+static int nfs4_try_mount(int flags, const char *dev_name,
+ struct nfs_parsed_mount_data *data,
+ struct vfsmount *mnt)
+{
+ char *export_path;
+ struct vfsmount *root_mnt;
+ int error;
+
+ dfprintk(MOUNT, "--> nfs4_try_mount()\n");
+
+ export_path = data->nfs_server.export_path;
+ data->nfs_server.export_path = "/";
+ root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
+ data->nfs_server.hostname);
+ data->nfs_server.export_path = export_path;
+
+ error = PTR_ERR(root_mnt);
+ if (IS_ERR(root_mnt))
+ goto out;
+
+ error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+
+out:
+ dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
+ error != 0 ? " [error]" : "");
+ return error;
+}
+
/*
* Get the superblock for an NFS4 mountpoint
*/
@@ -2625,8 +2654,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
{
struct nfs_parsed_mount_data *data;
- char *export_path;
- struct vfsmount *root_mnt;
int error = -ENOMEM;
data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -2638,17 +2665,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
if (error < 0)
goto out;
- export_path = data->nfs_server.export_path;
- data->nfs_server.export_path = "/";
- root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
- data->nfs_server.hostname);
- data->nfs_server.export_path = export_path;
-
- error = PTR_ERR(root_mnt);
- if (IS_ERR(root_mnt))
- goto out;
-
- error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+ error = nfs4_try_mount(flags, dev_name, data, mnt);
out:
kfree(data->client_address);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0a0a2ff767c3..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -13,6 +13,7 @@
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/swap.h>
+#include <linux/migrate.h>
#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
@@ -26,6 +27,7 @@
#include "internal.h"
#include "iostat.h"
#include "nfs4_fs.h"
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
@@ -87,17 +89,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
return p;
}
-static void nfs_writedata_free(struct nfs_write_data *p)
+void nfs_writedata_free(struct nfs_write_data *p)
{
if (p && (p->pagevec != &p->page_array[0]))
kfree(p->pagevec);
mempool_free(p, nfs_wdata_mempool);
}
-void nfs_writedata_release(void *data)
+static void nfs_writedata_release(struct nfs_write_data *wdata)
{
- struct nfs_write_data *wdata = data;
-
put_nfs_open_context(wdata->args.context);
nfs_writedata_free(wdata);
}
@@ -220,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
}
-/*
- * Find an associated nfs write request, and prepare to flush it out
- * May return an error if the user signalled nfs_wait_on_request().
- */
-static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page)
+static struct nfs_page *nfs_find_and_lock_request(struct page *page)
{
struct inode *inode = page->mapping->host;
struct nfs_page *req;
int ret;
spin_lock(&inode->i_lock);
- for(;;) {
+ for (;;) {
req = nfs_page_find_request_locked(page);
- if (req == NULL) {
- spin_unlock(&inode->i_lock);
- return 0;
- }
+ if (req == NULL)
+ break;
if (nfs_set_page_tag_locked(req))
break;
/* Note: If we hold the page lock, as is the case in nfs_writepage,
@@ -249,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
ret = nfs_wait_on_request(req);
nfs_release_request(req);
if (ret != 0)
- return ret;
+ return ERR_PTR(ret);
spin_lock(&inode->i_lock);
}
- if (test_bit(PG_CLEAN, &req->wb_flags)) {
- spin_unlock(&inode->i_lock);
- BUG();
- }
- if (nfs_set_page_writeback(page) != 0) {
- spin_unlock(&inode->i_lock);
- BUG();
- }
spin_unlock(&inode->i_lock);
+ return req;
+}
+
+/*
+ * Find an associated nfs write request, and prepare to flush it out
+ * May return an error if the user signalled nfs_wait_on_request().
+ */
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+ struct page *page)
+{
+ struct nfs_page *req;
+ int ret = 0;
+
+ req = nfs_find_and_lock_request(page);
+ if (!req)
+ goto out;
+ ret = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto out;
+
+ ret = nfs_set_page_writeback(page);
+ BUG_ON(ret != 0);
+ BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
+
if (!nfs_pageio_add_request(pgio, req)) {
nfs_redirty_request(req);
- return pgio->pg_error;
+ ret = pgio->pg_error;
}
- return 0;
+out:
+ return ret;
}
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
@@ -1480,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
.nr_to_write = LONG_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .for_writepages = 1,
};
return __nfs_write_mapping(mapping, &wbc, how);
@@ -1582,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page)
return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
}
+#ifdef CONFIG_MIGRATION
+int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
+ struct page *page)
+{
+ struct nfs_page *req;
+ int ret;
+
+ if (PageFsCache(page))
+ nfs_fscache_release_page(page, GFP_KERNEL);
+
+ req = nfs_find_and_lock_request(page);
+ ret = PTR_ERR(req);
+ if (IS_ERR(req))
+ goto out;
+
+ ret = migrate_page(mapping, newpage, page);
+ if (!req)
+ goto out;
+ if (ret)
+ goto out_unlock;
+ page_cache_get(newpage);
+ req->wb_page = newpage;
+ SetPagePrivate(newpage);
+ set_page_private(newpage, page_private(page));
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ page_cache_release(page);
+out_unlock:
+ nfs_clear_page_tag_locked(req);
+ nfs_release_request(req);
+out:
+ return ret;
+}
+#endif
+
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
int flags = nfsexp_flags(rqstp, exp);
int ret;
+ validate_process_creds();
+
/* discard any old override before preparing the new set */
revert_creds(get_cred(current->real_cred));
new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
else
new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted);
+ validate_process_creds();
put_cred(override_creds(new));
put_cred(new);
+ validate_process_creds();
return 0;
oom:
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b92a27629fb7..984a5ebcc1d6 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
+static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
+}
+
static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
static struct cache_detail svc_expkey_cache;
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = {
.hash_table = expkey_table,
.name = "nfsd.fh",
.cache_put = expkey_put,
- .cache_request = expkey_request,
+ .cache_upcall = expkey_upcall,
.cache_parse = expkey_parse,
.cache_show = expkey_show,
.match = expkey_match,
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd,
(*bpp)[-1] = '\n';
}
+static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
+}
+
static struct svc_export *svc_export_update(struct svc_export *new,
struct svc_export *old);
static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = {
.hash_table = export_table,
.name = "nfsd.export",
.cache_put = svc_export_put,
- .cache_request = svc_export_request,
+ .cache_upcall = svc_export_upcall,
.cache_parse = svc_export_parse,
.cache_show = svc_export_show,
.match = svc_export_match,
@@ -1331,6 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (rv)
goto out;
rv = check_nfsd_access(exp, rqstp);
+ if (rv)
+ fh_put(fhp);
out:
exp_put(exp);
return rv;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 01d4ec1c88e0..edf926e1062f 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -814,17 +814,6 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
return p;
}
-static __be32 *
-encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
- struct svc_fh *fhp)
-{
- p = encode_post_op_attr(cd->rqstp, p, fhp);
- *p++ = xdr_one; /* yes, a file handle follows */
- p = encode_fh(p, fhp);
- fh_put(fhp);
- return p;
-}
-
static int
compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
const char *name, int namlen)
@@ -836,29 +825,54 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
dparent = cd->fh.fh_dentry;
exp = cd->fh.fh_export;
- fh_init(fhp, NFS3_FHSIZE);
if (isdotent(name, namlen)) {
if (namlen == 2) {
dchild = dget_parent(dparent);
if (dchild == dparent) {
/* filesystem root - cannot return filehandle for ".." */
dput(dchild);
- return 1;
+ return -ENOENT;
}
} else
dchild = dget(dparent);
} else
dchild = lookup_one_len(name, dparent, namlen);
if (IS_ERR(dchild))
- return 1;
- if (d_mountpoint(dchild) ||
- fh_compose(fhp, exp, dchild, &cd->fh) != 0 ||
- !dchild->d_inode)
- rv = 1;
+ return -ENOENT;
+ rv = -ENOENT;
+ if (d_mountpoint(dchild))
+ goto out;
+ rv = fh_compose(fhp, exp, dchild, &cd->fh);
+ if (rv)
+ goto out;
+ if (!dchild->d_inode)
+ goto out;
+ rv = 0;
+out:
dput(dchild);
return rv;
}
+__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+{
+ struct svc_fh fh;
+ int err;
+
+ fh_init(&fh, NFS3_FHSIZE);
+ err = compose_entry_fh(cd, &fh, name, namlen);
+ if (err) {
+ *p++ = 0;
+ *p++ = 0;
+ goto out;
+ }
+ p = encode_post_op_attr(cd->rqstp, p, &fh);
+ *p++ = xdr_one; /* yes, a file handle follows */
+ p = encode_fh(p, &fh);
+out:
+ fh_put(&fh);
+ return p;
+}
+
/*
* Encode a directory entry. This one works for both normal readdir
* and readdirplus.
@@ -929,16 +943,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p = encode_entry_baggage(cd, p, name, namlen, ino);
- /* throw in readdirplus baggage */
- if (plus) {
- struct svc_fh fh;
-
- if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
- *p++ = 0;
- *p++ = 0;
- } else
- p = encode_entryplus_baggage(cd, p, &fh);
- }
+ if (plus)
+ p = encode_entryplus_baggage(cd, p, name, namlen);
num_entry_words = p - cd->buffer;
} else if (cd->rqstp->rq_respages[pn+1] != NULL) {
/* temporarily encode entry into next page, then move back to
@@ -951,17 +957,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
- /* throw in readdirplus baggage */
- if (plus) {
- struct svc_fh fh;
-
- if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
- /* zero out the filehandle */
- *p1++ = 0;
- *p1++ = 0;
- } else
- p1 = encode_entryplus_baggage(cd, p1, &fh);
- }
+ if (plus)
+ p = encode_entryplus_baggage(cd, p1, name, namlen);
/* determine entry word length and lengths to go in pages */
num_entry_words = p1 - tmp;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 54b8b4140c8f..725d02f210e2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -321,7 +321,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
deny = ~pas.group & pas.other;
if (deny) {
ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
- ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
+ ace->flag = eflag;
ace->access_mask = deny_mask_from_posix(deny, flags);
ace->whotype = NFS4_ACL_WHO_GROUP;
ace++;
@@ -335,7 +335,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
if (deny) {
ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
- ace->access_mask = mask_from_posix(deny, flags);
+ ace->access_mask = deny_mask_from_posix(deny, flags);
ace->whotype = NFS4_ACL_WHO_NAMED;
ace->who = pa->e_id;
ace++;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3fd23f7aceca..24e8d78f8dde 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -43,25 +43,30 @@
#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/svcsock.h>
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/state.h>
#include <linux/sunrpc/sched.h>
#include <linux/nfs4.h>
+#include <linux/sunrpc/xprtsock.h>
#define NFSDDBG_FACILITY NFSDDBG_PROC
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
+#define NFS4_STATEID_SIZE 16
/* Index of predefined Linux callback client operations */
enum {
- NFSPROC4_CLNT_CB_NULL = 0,
+ NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_SEQUENCE,
};
enum nfs_cb_opnum4 {
OP_CB_RECALL = 4,
+ OP_CB_SEQUENCE = 11,
};
#define NFS4_MAXTAGLEN 20
@@ -70,17 +75,29 @@ enum nfs_cb_opnum4 {
#define NFS4_dec_cb_null_sz 0
#define cb_compound_enc_hdr_sz 4
#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
+#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
+#define cb_sequence_enc_sz (sessionid_sz + 4 + \
+ 1 /* no referring calls list yet */)
+#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
+
#define op_enc_sz 1
#define op_dec_sz 2
#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
1 + enc_stateid_sz + \
enc_nfs4_fh_sz)
#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
op_dec_sz)
+struct nfs4_rpc_args {
+ void *args_op;
+ struct nfsd4_cb_sequence args_seq;
+};
+
/*
* Generic encode routines from fs/nfs/nfs4xdr.c
*/
@@ -137,11 +154,13 @@ xdr_error: \
} while (0)
struct nfs4_cb_compound_hdr {
- int status;
- u32 ident;
+ /* args */
+ u32 ident; /* minorversion 0 only */
u32 nops;
__be32 *nops_p;
u32 minorversion;
+ /* res */
+ int status;
u32 taglen;
char *tag;
};
@@ -238,6 +257,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
hdr->nops++;
}
+static void
+encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ __be32 *p;
+
+ if (hdr->minorversion == 0)
+ return;
+
+ RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
+
+ WRITE32(OP_CB_SEQUENCE);
+ WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
+ WRITE32(args->cbs_clp->cl_cb_seq_nr);
+ WRITE32(0); /* slotid, always 0 */
+ WRITE32(0); /* highest slotid always 0 */
+ WRITE32(0); /* cachethis always 0 */
+ WRITE32(0); /* FIXME: support referring_call_lists */
+ hdr->nops++;
+}
+
static int
nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
{
@@ -249,15 +289,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
}
static int
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args)
+nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
+ struct nfs4_rpc_args *rpc_args)
{
struct xdr_stream xdr;
+ struct nfs4_delegation *args = rpc_args->args_op;
struct nfs4_cb_compound_hdr hdr = {
.ident = args->dl_ident,
+ .minorversion = rpc_args->args_seq.cbs_minorversion,
};
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_cb_compound_hdr(&xdr, &hdr);
+ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
encode_cb_recall(&xdr, args, &hdr);
encode_cb_nops(&hdr);
return 0;
@@ -299,6 +343,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
return 0;
}
+/*
+ * Our current back channel implmentation supports a single backchannel
+ * with a single slot.
+ */
+static int
+decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
+ struct rpc_rqst *rqstp)
+{
+ struct nfs4_sessionid id;
+ int status;
+ u32 dummy;
+ __be32 *p;
+
+ if (res->cbs_minorversion == 0)
+ return 0;
+
+ status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
+ if (status)
+ return status;
+
+ /*
+ * If the server returns different values for sessionID, slotID or
+ * sequence number, the server is looney tunes.
+ */
+ status = -ESERVERFAULT;
+
+ READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+ memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
+ p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
+ if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN)) {
+ dprintk("%s Invalid session id\n", __func__);
+ goto out;
+ }
+ READ32(dummy);
+ if (dummy != res->cbs_clp->cl_cb_seq_nr) {
+ dprintk("%s Invalid sequence number\n", __func__);
+ goto out;
+ }
+ READ32(dummy); /* slotid must be 0 */
+ if (dummy != 0) {
+ dprintk("%s Invalid slotid\n", __func__);
+ goto out;
+ }
+ /* FIXME: process highest slotid and target highest slotid */
+ status = 0;
+out:
+ return status;
+}
+
+
static int
nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
{
@@ -306,7 +401,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
}
static int
-nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
+nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
+ struct nfsd4_cb_sequence *seq)
{
struct xdr_stream xdr;
struct nfs4_cb_compound_hdr hdr;
@@ -316,6 +412,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
status = decode_cb_compound_hdr(&xdr, &hdr);
if (status)
goto out;
+ if (seq) {
+ status = decode_cb_sequence(&xdr, seq, rqstp);
+ if (status)
+ goto out;
+ }
status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
out:
return status;
@@ -377,16 +478,15 @@ static int max_cb_time(void)
int setup_callback_client(struct nfs4_client *clp)
{
- struct sockaddr_in addr;
struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
struct rpc_timeout timeparms = {
.to_initval = max_cb_time(),
.to_retries = 0,
};
struct rpc_create_args args = {
- .protocol = IPPROTO_TCP,
- .address = (struct sockaddr *)&addr,
- .addrsize = sizeof(addr),
+ .protocol = XPRT_TRANSPORT_TCP,
+ .address = (struct sockaddr *) &cb->cb_addr,
+ .addrsize = cb->cb_addrlen,
.timeout = &timeparms,
.program = &cb_program,
.prognumber = cb->cb_prog,
@@ -399,13 +499,10 @@ int setup_callback_client(struct nfs4_client *clp)
if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
return -EINVAL;
-
- /* Initialize address */
- memset(&addr, 0, sizeof(addr));
- addr.sin_family = AF_INET;
- addr.sin_port = htons(cb->cb_port);
- addr.sin_addr.s_addr = htonl(cb->cb_addr);
-
+ if (cb->cb_minorversion) {
+ args.bc_xprt = clp->cl_cb_xprt;
+ args.protocol = XPRT_TRANSPORT_BC_TCP;
+ }
/* Create RPC client */
client = rpc_create(&args);
if (IS_ERR(client)) {
@@ -439,42 +536,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
.rpc_call_done = nfsd4_cb_probe_done,
};
-static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb)
-{
- struct auth_cred acred = {
- .machine_cred = 1
- };
+static struct rpc_cred *callback_cred;
- /*
- * Note in the gss case this doesn't actually have to wait for a
- * gss upcall (or any calls to the client); this just creates a
- * non-uptodate cred which the rpc state machine will fill in with
- * a refresh_upcall later.
- */
- return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
- RPCAUTH_LOOKUP_NEW);
+int set_callback_cred(void)
+{
+ callback_cred = rpc_lookup_machine_cred();
+ if (!callback_cred)
+ return -ENOMEM;
+ return 0;
}
+
void do_probe_callback(struct nfs4_client *clp)
{
struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
.rpc_argp = clp,
+ .rpc_cred = callback_cred
};
- struct rpc_cred *cred;
int status;
- cred = lookup_cb_cred(cb);
- if (IS_ERR(cred)) {
- status = PTR_ERR(cred);
- goto out;
- }
- cb->cb_cred = cred;
- msg.rpc_cred = cb->cb_cred;
status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
&nfsd4_cb_probe_ops, (void *)clp);
-out:
if (status) {
warn_no_callback_path(clp, status);
put_nfs4_client(clp);
@@ -503,11 +587,95 @@ nfsd4_probe_callback(struct nfs4_client *clp)
do_probe_callback(clp);
}
+/*
+ * There's currently a single callback channel slot.
+ * If the slot is available, then mark it busy. Otherwise, set the
+ * thread for sleeping on the callback RPC wait queue.
+ */
+static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
+ struct rpc_task *task)
+{
+ struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+ u32 *ptr = (u32 *)clp->cl_sessionid.data;
+ int status = 0;
+
+ dprintk("%s: %u:%u:%u:%u\n", __func__,
+ ptr[0], ptr[1], ptr[2], ptr[3]);
+
+ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
+ rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
+ dprintk("%s slot is busy\n", __func__);
+ status = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * We'll need the clp during XDR encoding and decoding,
+ * and the sequence during decoding to verify the reply
+ */
+ args->args_seq.cbs_clp = clp;
+ task->tk_msg.rpc_resp = &args->args_seq;
+
+out:
+ dprintk("%s status=%d\n", __func__, status);
+ return status;
+}
+
+/*
+ * TODO: cb_sequence should support referring call lists, cachethis, multiple
+ * slots, and mark callback channel down on communication errors.
+ */
+static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_delegation *dp = calldata;
+ struct nfs4_client *clp = dp->dl_client;
+ struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
+ u32 minorversion = clp->cl_cb_conn.cb_minorversion;
+ int status = 0;
+
+ args->args_seq.cbs_minorversion = minorversion;
+ if (minorversion) {
+ status = nfsd41_cb_setup_sequence(clp, task);
+ if (status) {
+ if (status != -EAGAIN) {
+ /* terminate rpc task */
+ task->tk_status = status;
+ task->tk_action = NULL;
+ }
+ return;
+ }
+ }
+ rpc_call_start(task);
+}
+
+static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs4_delegation *dp = calldata;
+ struct nfs4_client *clp = dp->dl_client;
+
+ dprintk("%s: minorversion=%d\n", __func__,
+ clp->cl_cb_conn.cb_minorversion);
+
+ if (clp->cl_cb_conn.cb_minorversion) {
+ /* No need for lock, access serialized in nfsd4_cb_prepare */
+ ++clp->cl_cb_seq_nr;
+ clear_bit(0, &clp->cl_cb_slot_busy);
+ rpc_wake_up_next(&clp->cl_cb_waitq);
+ dprintk("%s: freed slot, new seqid=%d\n", __func__,
+ clp->cl_cb_seq_nr);
+
+ /* We're done looking into the sequence information */
+ task->tk_msg.rpc_resp = NULL;
+ }
+}
+
static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
{
struct nfs4_delegation *dp = calldata;
struct nfs4_client *clp = dp->dl_client;
+ nfsd4_cb_done(task, calldata);
+
switch (task->tk_status) {
case -EIO:
/* Network partition? */
@@ -520,16 +688,19 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
break;
default:
/* success, or error we can't handle */
- return;
+ goto done;
}
if (dp->dl_retries--) {
rpc_delay(task, 2*HZ);
task->tk_status = 0;
rpc_restart_call(task);
+ return;
} else {
atomic_set(&clp->cl_cb_conn.cb_set, 0);
warn_no_callback_path(clp, task->tk_status);
}
+done:
+ kfree(task->tk_msg.rpc_argp);
}
static void nfsd4_cb_recall_release(void *calldata)
@@ -542,6 +713,7 @@ static void nfsd4_cb_recall_release(void *calldata)
}
static const struct rpc_call_ops nfsd4_cb_recall_ops = {
+ .rpc_call_prepare = nfsd4_cb_prepare,
.rpc_call_done = nfsd4_cb_recall_done,
.rpc_release = nfsd4_cb_recall_release,
};
@@ -554,17 +726,24 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_client;
struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
+ struct nfs4_rpc_args *args;
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
- .rpc_argp = dp,
- .rpc_cred = clp->cl_cb_conn.cb_cred
+ .rpc_cred = callback_cred
};
- int status;
+ int status = -ENOMEM;
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ goto out;
+ args->args_op = dp;
+ msg.rpc_argp = args;
dp->dl_retries = 1;
status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
&nfsd4_cb_recall_ops, dp);
+out:
if (status) {
+ kfree(args);
put_nfs4_client(clp);
nfs4_put_delegation(dp);
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b398421b051..cdfa86fa1471 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -146,6 +146,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
+idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+ return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
+}
+
+static int
idtoname_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -175,10 +181,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
}
static void
-warn_no_idmapd(struct cache_detail *detail)
+warn_no_idmapd(struct cache_detail *detail, int has_died)
{
printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
- detail->last_close? "died" : "not been started");
+ has_died ? "died" : "not been started");
}
@@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = {
.hash_table = idtoname_table,
.name = "nfs4.idtoname",
.cache_put = ent_put,
- .cache_request = idtoname_request,
+ .cache_upcall = idtoname_upcall,
.cache_parse = idtoname_parse,
.cache_show = idtoname_show,
.warn_no_listener = warn_no_idmapd,
@@ -325,6 +331,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
}
static int
+nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
+{
+ return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
+}
+
+static int
nametoid_match(struct cache_head *ca, struct cache_head *cb)
{
struct ent *a = container_of(ca, struct ent, h);
@@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = {
.hash_table = nametoid_table,
.name = "nfs4.nametoid",
.cache_put = ent_put,
- .cache_request = nametoid_request,
+ .cache_upcall = nametoid_upcall,
.cache_parse = nametoid_parse,
.cache_show = nametoid_show,
.warn_no_listener = warn_no_idmapd,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7c8801769a3c..bebc0c2e1b0a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -68,7 +68,6 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
u32 *bmval, u32 *writable)
{
struct dentry *dentry = cstate->current_fh.fh_dentry;
- struct svc_export *exp = cstate->current_fh.fh_export;
/*
* Check about attributes are supported by the NFSv4 server or not.
@@ -80,17 +79,13 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_attrnotsupp;
/*
- * Check FATTR4_WORD0_ACL & FATTR4_WORD0_FS_LOCATIONS can be supported
+ * Check FATTR4_WORD0_ACL can be supported
* in current environment or not.
*/
if (bmval[0] & FATTR4_WORD0_ACL) {
if (!IS_POSIXACL(dentry->d_inode))
return nfserr_attrnotsupp;
}
- if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
- if (exp->ex_fslocs.locations == NULL)
- return nfserr_attrnotsupp;
- }
/*
* According to spec, read-only attributes return ERR_INVAL.
@@ -123,6 +118,35 @@ nfsd4_check_open_attributes(struct svc_rqst *rqstp,
return status;
}
+static int
+is_create_with_attrs(struct nfsd4_open *open)
+{
+ return open->op_create == NFS4_OPEN_CREATE
+ && (open->op_createmode == NFS4_CREATE_UNCHECKED
+ || open->op_createmode == NFS4_CREATE_GUARDED
+ || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
+}
+
+/*
+ * if error occurs when setting the acl, just clear the acl bit
+ * in the returned attr bitmap.
+ */
+static void
+do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfs4_acl *acl, u32 *bmval)
+{
+ __be32 status;
+
+ status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
+ if (status)
+ /*
+ * We should probably fail the whole open at this point,
+ * but we've already created the file, so it's too late;
+ * So this seems the least of evils:
+ */
+ bmval[0] &= ~FATTR4_WORD0_ACL;
+}
+
static inline void
fh_dup2(struct svc_fh *dst, struct svc_fh *src)
{
@@ -206,6 +230,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
if (status)
goto out;
+ if (is_create_with_attrs(open) && open->op_acl != NULL)
+ do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval);
+
set_change_info(&open->op_cinfo, current_fh);
fh_dup2(current_fh, &resfh);
@@ -536,12 +563,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_badtype;
}
- if (!status) {
- fh_unlock(&cstate->current_fh);
- set_change_info(&create->cr_cinfo, &cstate->current_fh);
- fh_dup2(&cstate->current_fh, &resfh);
- }
+ if (status)
+ goto out;
+
+ if (create->cr_acl != NULL)
+ do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
+ create->cr_bmval);
+ fh_unlock(&cstate->current_fh);
+ set_change_info(&create->cr_cinfo, &cstate->current_fh);
+ fh_dup2(&cstate->current_fh, &resfh);
+out:
fh_put(&resfh);
return status;
}
@@ -947,34 +979,6 @@ static struct nfsd4_operation nfsd4_ops[];
static const char *nfsd4_op_name(unsigned opnum);
/*
- * This is a replay of a compound for which no cache entry pages
- * were used. Encode the sequence operation, and if cachethis is FALSE
- * encode the uncache rep error on the next operation.
- */
-static __be32
-nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
- struct nfsd4_compoundres *resp)
-{
- struct nfsd4_op *op;
-
- dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
- resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
-
- /* Encode the replayed sequence operation */
- BUG_ON(resp->opcnt != 1);
- op = &args->ops[resp->opcnt - 1];
- nfsd4_encode_operation(resp, op);
-
- /*return nfserr_retry_uncached_rep in next operation. */
- if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
- op = &args->ops[resp->opcnt++];
- op->status = nfserr_retry_uncached_rep;
- nfsd4_encode_operation(resp, op);
- }
- return op->status;
-}
-
-/*
* Enforce NFSv4.1 COMPOUND ordering rules.
*
* TODO:
@@ -1083,13 +1087,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
BUG_ON(op->status == nfs_ok);
encode_op:
- /* Only from SEQUENCE or CREATE_SESSION */
+ /* Only from SEQUENCE */
if (resp->cstate.status == nfserr_replay_cache) {
dprintk("%s NFS4.1 replay from cache\n", __func__);
- if (nfsd4_not_cached(resp))
- status = nfsd4_enc_uncached_replay(args, resp);
- else
- status = op->status;
+ status = op->status;
goto out;
}
if (op->status == nfserr_replay_me) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..2153f9bdbebd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -55,6 +55,7 @@
#include <linux/lockd/bind.h>
#include <linux/module.h>
#include <linux/sunrpc/svcauth_gss.h>
+#include <linux/sunrpc/clnt.h>
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -413,36 +414,65 @@ gen_sessionid(struct nfsd4_session *ses)
}
/*
- * Give the client the number of slots it requests bound by
- * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
+ * The protocol defines ca_maxresponssize_cached to include the size of
+ * the rpc header, but all we need to cache is the data starting after
+ * the end of the initial SEQUENCE operation--the rest we regenerate
+ * each time. Therefore we can advertise a ca_maxresponssize_cached
+ * value that is the number of bytes in our cache plus a few additional
+ * bytes. In order to stay on the safe side, and not promise more than
+ * we can cache, those additional bytes must be the minimum possible: 24
+ * bytes of rpc header (xid through accept state, with AUTH_NULL
+ * verifier), 12 for the compound header (with zero-length tag), and 44
+ * for the SEQUENCE op response:
+ */
+#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
+
+/*
+ * Give the client the number of ca_maxresponsesize_cached slots it
+ * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
+ * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
+ * than NFSD_MAX_SLOTS_PER_SESSION.
*
- * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
- * should (up to a point) re-negotiate active sessions and reduce their
- * slot usage to make rooom for new connections. For now we just fail the
- * create session.
+ * If we run out of reserved DRC memory we should (up to a point)
+ * re-negotiate active sessions and reduce their slot usage to make
+ * rooom for new connections. For now we just fail the create session.
*/
-static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
+static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
{
- int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+ int mem, size = fchan->maxresp_cached;
if (fchan->maxreqs < 1)
return nfserr_inval;
- else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
- fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
- spin_lock(&nfsd_serv->sv_lock);
- if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
- np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
- nfsd_serv->sv_drc_pages_used += np;
- spin_unlock(&nfsd_serv->sv_lock);
+ if (size < NFSD_MIN_HDR_SEQ_SZ)
+ size = NFSD_MIN_HDR_SEQ_SZ;
+ size -= NFSD_MIN_HDR_SEQ_SZ;
+ if (size > NFSD_SLOT_CACHE_SIZE)
+ size = NFSD_SLOT_CACHE_SIZE;
+
+ /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
+ mem = fchan->maxreqs * size;
+ if (mem > NFSD_MAX_MEM_PER_SESSION) {
+ fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
+ if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+ fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+ mem = fchan->maxreqs * size;
+ }
- if (np <= 0) {
- status = nfserr_resource;
- fchan->maxreqs = 0;
- } else
- fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
+ spin_lock(&nfsd_drc_lock);
+ /* bound the total session drc memory ussage */
+ if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
+ fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
+ mem = fchan->maxreqs * size;
+ }
+ nfsd_drc_mem_used += mem;
+ spin_unlock(&nfsd_drc_lock);
- return status;
+ if (fchan->maxreqs == 0)
+ return nfserr_serverfault;
+
+ fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
+ return 0;
}
/*
@@ -466,36 +496,41 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
fchan->maxresp_sz = maxcount;
session_fchan->maxresp_sz = fchan->maxresp_sz;
- /* Set the max response cached size our default which is
- * a multiple of PAGE_SIZE and small */
- session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
- fchan->maxresp_cached = session_fchan->maxresp_cached;
-
/* Use the client's maxops if possible */
if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
session_fchan->maxops = fchan->maxops;
- /* try to use the client requested number of slots */
- if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
- fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
-
/* FIXME: Error means no more DRC pages so the server should
* recover pages from existing sessions. For now fail session
* creation.
*/
- status = set_forechannel_maxreqs(fchan);
+ status = set_forechannel_drc_size(fchan);
+ session_fchan->maxresp_cached = fchan->maxresp_cached;
session_fchan->maxreqs = fchan->maxreqs;
+
+ dprintk("%s status %d\n", __func__, status);
return status;
}
+static void
+free_session_slots(struct nfsd4_session *ses)
+{
+ int i;
+
+ for (i = 0; i < ses->se_fchannel.maxreqs; i++)
+ kfree(ses->se_slots[i]);
+}
+
static int
alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
struct nfsd4_create_session *cses)
{
struct nfsd4_session *new, tmp;
- int idx, status = nfserr_resource, slotsize;
+ struct nfsd4_slot *sp;
+ int idx, slotsize, cachesize, i;
+ int status;
memset(&tmp, 0, sizeof(tmp));
@@ -506,14 +541,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
if (status)
goto out;
- /* allocate struct nfsd4_session and slot table in one piece */
- slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot);
+ BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
+ + sizeof(struct nfsd4_session) > PAGE_SIZE);
+
+ status = nfserr_serverfault;
+ /* allocate struct nfsd4_session and slot table pointers in one piece */
+ slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
if (!new)
goto out;
memcpy(new, &tmp, sizeof(*new));
+ /* allocate each struct nfsd4_slot and data cache in one piece */
+ cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+ for (i = 0; i < new->se_fchannel.maxreqs; i++) {
+ sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
+ if (!sp)
+ goto out_free;
+ new->se_slots[i] = sp;
+ }
+
new->se_client = clp;
gen_sessionid(new);
idx = hash_sessionid(&new->se_sessionid);
@@ -530,6 +578,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
status = nfs_ok;
out:
return status;
+out_free:
+ free_session_slots(new);
+ kfree(new);
+ goto out;
}
/* caller must hold sessionid_lock */
@@ -572,19 +624,16 @@ release_session(struct nfsd4_session *ses)
nfsd4_put_session(ses);
}
-static void nfsd4_release_respages(struct page **respages, short resused);
-
void
free_session(struct kref *kref)
{
struct nfsd4_session *ses;
- int i;
ses = container_of(kref, struct nfsd4_session, se_ref);
- for (i = 0; i < ses->se_fchannel.maxreqs; i++) {
- struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
- nfsd4_release_respages(e->ce_respages, e->ce_resused);
- }
+ spin_lock(&nfsd_drc_lock);
+ nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE;
+ spin_unlock(&nfsd_drc_lock);
+ free_session_slots(ses);
kfree(ses);
}
@@ -647,18 +696,14 @@ shutdown_callback_client(struct nfs4_client *clp)
clp->cl_cb_conn.cb_client = NULL;
rpc_shutdown_client(clnt);
}
- if (clp->cl_cb_conn.cb_cred) {
- put_rpccred(clp->cl_cb_conn.cb_cred);
- clp->cl_cb_conn.cb_cred = NULL;
- }
}
static inline void
free_client(struct nfs4_client *clp)
{
shutdown_callback_client(clp);
- nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
- clp->cl_slot.sl_cache_entry.ce_resused);
+ if (clp->cl_cb_xprt)
+ svc_xprt_put(clp->cl_cb_xprt);
if (clp->cl_cred.cr_group_info)
put_group_info(clp->cl_cred.cr_group_info);
kfree(clp->cl_principal);
@@ -714,25 +759,6 @@ expire_client(struct nfs4_client *clp)
put_nfs4_client(clp);
}
-static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
-{
- struct nfs4_client *clp;
-
- clp = alloc_client(name);
- if (clp == NULL)
- return NULL;
- memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
- atomic_set(&clp->cl_count, 1);
- atomic_set(&clp->cl_cb_conn.cb_set, 0);
- INIT_LIST_HEAD(&clp->cl_idhash);
- INIT_LIST_HEAD(&clp->cl_strhash);
- INIT_LIST_HEAD(&clp->cl_openowners);
- INIT_LIST_HEAD(&clp->cl_delegations);
- INIT_LIST_HEAD(&clp->cl_sessions);
- INIT_LIST_HEAD(&clp->cl_lru);
- return clp;
-}
-
static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
{
memcpy(target->cl_verifier.data, source->data,
@@ -795,6 +821,46 @@ static void gen_confirm(struct nfs4_client *clp)
*p++ = i++;
}
+static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
+ struct svc_rqst *rqstp, nfs4_verifier *verf)
+{
+ struct nfs4_client *clp;
+ struct sockaddr *sa = svc_addr(rqstp);
+ char *princ;
+
+ clp = alloc_client(name);
+ if (clp == NULL)
+ return NULL;
+
+ princ = svc_gss_principal(rqstp);
+ if (princ) {
+ clp->cl_principal = kstrdup(princ, GFP_KERNEL);
+ if (clp->cl_principal == NULL) {
+ free_client(clp);
+ return NULL;
+ }
+ }
+
+ memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
+ atomic_set(&clp->cl_count, 1);
+ atomic_set(&clp->cl_cb_conn.cb_set, 0);
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_strhash);
+ INIT_LIST_HEAD(&clp->cl_openowners);
+ INIT_LIST_HEAD(&clp->cl_delegations);
+ INIT_LIST_HEAD(&clp->cl_sessions);
+ INIT_LIST_HEAD(&clp->cl_lru);
+ clear_bit(0, &clp->cl_cb_slot_busy);
+ rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
+ copy_verf(clp, verf);
+ rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
+ clp->cl_flavor = rqstp->rq_flavor;
+ copy_cred(&clp->cl_cred, &rqstp->rq_cred);
+ gen_confirm(clp);
+
+ return clp;
+}
+
static int check_name(struct xdr_netobj name)
{
if (name.len == 0)
@@ -902,93 +968,40 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
return NULL;
}
-/* a helper function for parse_callback */
-static int
-parse_octet(unsigned int *lenp, char **addrp)
-{
- unsigned int len = *lenp;
- char *p = *addrp;
- int n = -1;
- char c;
-
- for (;;) {
- if (!len)
- break;
- len--;
- c = *p++;
- if (c == '.')
- break;
- if ((c < '0') || (c > '9')) {
- n = -1;
- break;
- }
- if (n < 0)
- n = 0;
- n = (n * 10) + (c - '0');
- if (n > 255) {
- n = -1;
- break;
- }
- }
- *lenp = len;
- *addrp = p;
- return n;
-}
-
-/* parse and set the setclientid ipv4 callback address */
-static int
-parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
-{
- int temp = 0;
- u32 cbaddr = 0;
- u16 cbport = 0;
- u32 addrlen = addr_len;
- char *addr = addr_val;
- int i, shift;
-
- /* ipaddress */
- shift = 24;
- for(i = 4; i > 0 ; i--) {
- if ((temp = parse_octet(&addrlen, &addr)) < 0) {
- return 0;
- }
- cbaddr |= (temp << shift);
- if (shift > 0)
- shift -= 8;
- }
- *cbaddrp = cbaddr;
-
- /* port */
- shift = 8;
- for(i = 2; i > 0 ; i--) {
- if ((temp = parse_octet(&addrlen, &addr)) < 0) {
- return 0;
- }
- cbport |= (temp << shift);
- if (shift > 0)
- shift -= 8;
- }
- *cbportp = cbport;
- return 1;
-}
-
static void
-gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
+gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
{
struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
-
- /* Currently, we only support tcp for the callback channel */
- if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3))
+ unsigned short expected_family;
+
+ /* Currently, we only support tcp and tcp6 for the callback channel */
+ if (se->se_callback_netid_len == 3 &&
+ !memcmp(se->se_callback_netid_val, "tcp", 3))
+ expected_family = AF_INET;
+ else if (se->se_callback_netid_len == 4 &&
+ !memcmp(se->se_callback_netid_val, "tcp6", 4))
+ expected_family = AF_INET6;
+ else
goto out_err;
- if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
- &cb->cb_addr, &cb->cb_port)))
+ cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
+ se->se_callback_addr_len,
+ (struct sockaddr *) &cb->cb_addr,
+ sizeof(cb->cb_addr));
+
+ if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
goto out_err;
+
+ if (cb->cb_addr.ss_family == AF_INET6)
+ ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
+
cb->cb_minorversion = 0;
cb->cb_prog = se->se_callback_prog;
cb->cb_ident = se->se_callback_ident;
return;
out_err:
+ cb->cb_addr.ss_family = AF_UNSPEC;
+ cb->cb_addrlen = 0;
dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
"will not receive delegations\n",
clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
@@ -996,175 +1009,87 @@ out_err:
return;
}
-void
-nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
-{
- struct nfsd4_compoundres *resp = rqstp->rq_resp;
-
- resp->cstate.statp = statp;
-}
-
/*
- * Dereference the result pages.
+ * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
*/
-static void
-nfsd4_release_respages(struct page **respages, short resused)
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
- int i;
+ struct nfsd4_slot *slot = resp->cstate.slot;
+ unsigned int base;
- dprintk("--> %s\n", __func__);
- for (i = 0; i < resused; i++) {
- if (!respages[i])
- continue;
- put_page(respages[i]);
- respages[i] = NULL;
- }
-}
+ dprintk("--> %s slot %p\n", __func__, slot);
-static void
-nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
-{
- int i;
+ slot->sl_opcnt = resp->opcnt;
+ slot->sl_status = resp->cstate.status;
- for (i = 0; i < count; i++) {
- topages[i] = frompages[i];
- if (!topages[i])
- continue;
- get_page(topages[i]);
+ if (nfsd4_not_cached(resp)) {
+ slot->sl_datalen = 0;
+ return;
}
+ slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
+ base = (char *)resp->cstate.datap -
+ (char *)resp->xbuf->head[0].iov_base;
+ if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
+ slot->sl_datalen))
+ WARN("%s: sessions DRC could not cache compound\n", __func__);
+ return;
}
/*
- * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
- * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
- * length of the XDR response is less than se_fmaxresp_cached
- * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a
- * of the reply (e.g. readdir).
+ * Encode the replay sequence operation from the slot values.
+ * If cachethis is FALSE encode the uncached rep error on the next
+ * operation which sets resp->p and increments resp->opcnt for
+ * nfs4svc_encode_compoundres.
*
- * Store the base and length of the rq_req.head[0] page
- * of the NFSv4.1 data, just past the rpc header.
*/
-void
-nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+static __be32
+nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
+ struct nfsd4_compoundres *resp)
{
- struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
- struct svc_rqst *rqstp = resp->rqstp;
- struct nfsd4_compoundargs *args = rqstp->rq_argp;
- struct nfsd4_op *op = &args->ops[resp->opcnt];
- struct kvec *resv = &rqstp->rq_res.head[0];
-
- dprintk("--> %s entry %p\n", __func__, entry);
-
- /* Don't cache a failed OP_SEQUENCE. */
- if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
- return;
+ struct nfsd4_op *op;
+ struct nfsd4_slot *slot = resp->cstate.slot;
- nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
- entry->ce_opcnt = resp->opcnt;
- entry->ce_status = resp->cstate.status;
+ dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__,
+ resp->opcnt, resp->cstate.slot->sl_cachethis);
- /*
- * Don't need a page to cache just the sequence operation - the slot
- * does this for us!
- */
+ /* Encode the replayed sequence operation */
+ op = &args->ops[resp->opcnt - 1];
+ nfsd4_encode_operation(resp, op);
- if (nfsd4_not_cached(resp)) {
- entry->ce_resused = 0;
- entry->ce_rpchdrlen = 0;
- dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
- resp->cstate.slot->sl_cache_entry.ce_cachethis);
- return;
- }
- entry->ce_resused = rqstp->rq_resused;
- if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
- entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
- nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
- entry->ce_resused);
- entry->ce_datav.iov_base = resp->cstate.statp;
- entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
- (char *)page_address(rqstp->rq_respages[0]));
- /* Current request rpc header length*/
- entry->ce_rpchdrlen = (char *)resp->cstate.statp -
- (char *)page_address(rqstp->rq_respages[0]);
-}
-
-/*
- * We keep the rpc header, but take the nfs reply from the replycache.
- */
-static int
-nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
- struct nfsd4_cache_entry *entry)
-{
- struct svc_rqst *rqstp = resp->rqstp;
- struct kvec *resv = &resp->rqstp->rq_res.head[0];
- int len;
-
- /* Current request rpc header length*/
- len = (char *)resp->cstate.statp -
- (char *)page_address(rqstp->rq_respages[0]);
- if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
- dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
- entry->ce_datav.iov_len);
- return 0;
+ /* Return nfserr_retry_uncached_rep in next operation. */
+ if (args->opcnt > 1 && slot->sl_cachethis == 0) {
+ op = &args->ops[resp->opcnt++];
+ op->status = nfserr_retry_uncached_rep;
+ nfsd4_encode_operation(resp, op);
}
- /* copy the cached reply nfsd data past the current rpc header */
- memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
- entry->ce_datav.iov_len);
- resv->iov_len = len + entry->ce_datav.iov_len;
- return 1;
+ return op->status;
}
/*
- * Keep the first page of the replay. Copy the NFSv4.1 data from the first
- * cached page. Replace any futher replay pages from the cache.
+ * The sequence operation is not cached because we can use the slot and
+ * session values.
*/
__be32
nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq)
{
- struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+ struct nfsd4_slot *slot = resp->cstate.slot;
__be32 status;
- dprintk("--> %s entry %p\n", __func__, entry);
-
- /*
- * If this is just the sequence operation, we did not keep
- * a page in the cache entry because we can just use the
- * slot info stored in struct nfsd4_sequence that was checked
- * against the slot in nfsd4_sequence().
- *
- * This occurs when seq->cachethis is FALSE, or when the client
- * session inactivity timer fires and a solo sequence operation
- * is sent (lease renewal).
- */
- if (seq && nfsd4_not_cached(resp)) {
- seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
- return nfs_ok;
- }
-
- if (!nfsd41_copy_replay_data(resp, entry)) {
- /*
- * Not enough room to use the replay rpc header, send the
- * cached header. Release all the allocated result pages.
- */
- svc_free_res_pages(resp->rqstp);
- nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
- entry->ce_resused);
- } else {
- /* Release all but the first allocated result page */
+ dprintk("--> %s slot %p\n", __func__, slot);
- resp->rqstp->rq_resused--;
- svc_free_res_pages(resp->rqstp);
+ /* Either returns 0 or nfserr_retry_uncached */
+ status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
+ if (status == nfserr_retry_uncached_rep)
+ return status;
- nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
- &entry->ce_respages[1],
- entry->ce_resused - 1);
- }
+ /* The sequence operation has been encoded, cstate->datap set. */
+ memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
- resp->rqstp->rq_resused = entry->ce_resused;
- resp->opcnt = entry->ce_opcnt;
- resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
- status = entry->ce_status;
+ resp->opcnt = slot->sl_opcnt;
+ resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
+ status = slot->sl_status;
return status;
}
@@ -1194,13 +1119,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
int status;
unsigned int strhashval;
char dname[HEXDIR_LEN];
+ char addr_str[INET6_ADDRSTRLEN];
nfs4_verifier verf = exid->verifier;
- u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+ struct sockaddr *sa = svc_addr(rqstp);
+ rpc_ntop(sa, addr_str, sizeof(addr_str));
dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
- " ip_addr=%u flags %x, spa_how %d\n",
+ "ip_addr=%s flags %x, spa_how %d\n",
__func__, rqstp, exid, exid->clname.len, exid->clname.data,
- ip_addr, exid->flags, exid->spa_how);
+ addr_str, exid->flags, exid->spa_how);
if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
return nfserr_inval;
@@ -1281,28 +1208,23 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
out_new:
/* Normal case */
- new = create_client(exid->clname, dname);
+ new = create_client(exid->clname, dname, rqstp, &verf);
if (new == NULL) {
- status = nfserr_resource;
+ status = nfserr_serverfault;
goto out;
}
- copy_verf(new, &verf);
- copy_cred(&new->cl_cred, &rqstp->rq_cred);
- new->cl_addr = ip_addr;
gen_clid(new);
- gen_confirm(new);
add_to_unconfirmed(new, strhashval);
out_copy:
exid->clientid.cl_boot = new->cl_clientid.cl_boot;
exid->clientid.cl_id = new->cl_clientid.cl_id;
- new->cl_slot.sl_seqid = 0;
exid->seqid = 1;
nfsd4_set_ex_flags(new, exid);
dprintk("nfsd4_exchange_id seqid %d flags %x\n",
- new->cl_slot.sl_seqid, new->cl_exchange_flags);
+ new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
status = nfs_ok;
out:
@@ -1313,40 +1235,60 @@ error:
}
static int
-check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
{
- dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
- slot->sl_seqid);
+ dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
+ slot_seqid);
/* The slot is in use, and no response has been sent. */
- if (slot->sl_inuse) {
- if (seqid == slot->sl_seqid)
+ if (slot_inuse) {
+ if (seqid == slot_seqid)
return nfserr_jukebox;
else
return nfserr_seq_misordered;
}
/* Normal */
- if (likely(seqid == slot->sl_seqid + 1))
+ if (likely(seqid == slot_seqid + 1))
return nfs_ok;
/* Replay */
- if (seqid == slot->sl_seqid)
+ if (seqid == slot_seqid)
return nfserr_replay_cache;
/* Wraparound */
- if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+ if (seqid == 1 && (slot_seqid + 1) == 0)
return nfs_ok;
/* Misordered replay or misordered new request */
return nfserr_seq_misordered;
}
+/*
+ * Cache the create session result into the create session single DRC
+ * slot cache by saving the xdr structure. sl_seqid has been set.
+ * Do this for solo or embedded create session operations.
+ */
+static void
+nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
+ struct nfsd4_clid_slot *slot, int nfserr)
+{
+ slot->sl_status = nfserr;
+ memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
+}
+
+static __be32
+nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
+ struct nfsd4_clid_slot *slot)
+{
+ memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
+ return slot->sl_status;
+}
+
__be32
nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_create_session *cr_ses)
{
- u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
- struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct sockaddr *sa = svc_addr(rqstp);
struct nfs4_client *conf, *unconf;
- struct nfsd4_slot *slot = NULL;
+ struct nfsd4_clid_slot *cs_slot = NULL;
int status = 0;
nfs4_lock_state();
@@ -1354,40 +1296,38 @@ nfsd4_create_session(struct svc_rqst *rqstp,
conf = find_confirmed_client(&cr_ses->clientid);
if (conf) {
- slot = &conf->cl_slot;
- status = check_slot_seqid(cr_ses->seqid, slot);
+ cs_slot = &conf->cl_cs_slot;
+ status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status == nfserr_replay_cache) {
dprintk("Got a create_session replay! seqid= %d\n",
- slot->sl_seqid);
- cstate->slot = slot;
- cstate->status = status;
+ cs_slot->sl_seqid);
/* Return the cached reply status */
- status = nfsd4_replay_cache_entry(resp, NULL);
+ status = nfsd4_replay_create_session(cr_ses, cs_slot);
goto out;
- } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+ } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
status = nfserr_seq_misordered;
dprintk("Sequence misordered!\n");
dprintk("Expected seqid= %d but got seqid= %d\n",
- slot->sl_seqid, cr_ses->seqid);
+ cs_slot->sl_seqid, cr_ses->seqid);
goto out;
}
- conf->cl_slot.sl_seqid++;
+ cs_slot->sl_seqid++;
} else if (unconf) {
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
- (ip_addr != unconf->cl_addr)) {
+ !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
status = nfserr_clid_inuse;
goto out;
}
- slot = &unconf->cl_slot;
- status = check_slot_seqid(cr_ses->seqid, slot);
+ cs_slot = &unconf->cl_cs_slot;
+ status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
if (status) {
/* an unconfirmed replay returns misordered */
status = nfserr_seq_misordered;
- goto out;
+ goto out_cache;
}
- slot->sl_seqid++; /* from 0 to 1 */
+ cs_slot->sl_seqid++; /* from 0 to 1 */
move_to_confirmed(unconf);
/*
@@ -1396,6 +1336,19 @@ nfsd4_create_session(struct svc_rqst *rqstp,
cr_ses->flags &= ~SESSION4_PERSIST;
cr_ses->flags &= ~SESSION4_RDMA;
+ if (cr_ses->flags & SESSION4_BACK_CHAN) {
+ unconf->cl_cb_xprt = rqstp->rq_xprt;
+ svc_xprt_get(unconf->cl_cb_xprt);
+ rpc_copy_addr(
+ (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
+ sa);
+ unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+ unconf->cl_cb_conn.cb_minorversion =
+ cstate->minorversion;
+ unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
+ unconf->cl_cb_seq_nr = 1;
+ nfsd4_probe_callback(unconf);
+ }
conf = unconf;
} else {
status = nfserr_stale_clientid;
@@ -1408,12 +1361,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
NFS4_MAX_SESSIONID_LEN);
- cr_ses->seqid = slot->sl_seqid;
+ cr_ses->seqid = cs_slot->sl_seqid;
- slot->sl_inuse = true;
- cstate->slot = slot;
- /* Ensure a page is used for the cache */
- slot->sl_cache_entry.ce_cachethis = 1;
+out_cache:
+ /* cache solo and embedded create sessions under the state lock */
+ nfsd4_cache_create_session(cr_ses, cs_slot, status);
out:
nfs4_unlock_state();
dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1478,18 +1430,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (seq->slotid >= session->se_fchannel.maxreqs)
goto out;
- slot = &session->se_slots[seq->slotid];
+ slot = session->se_slots[seq->slotid];
dprintk("%s: slotid %d\n", __func__, seq->slotid);
- status = check_slot_seqid(seq->seqid, slot);
+ /* We do not negotiate the number of slots yet, so set the
+ * maxslots to the session maxreqs which is used to encode
+ * sr_highest_slotid and the sr_target_slot id to maxslots */
+ seq->maxslots = session->se_fchannel.maxreqs;
+
+ status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse);
if (status == nfserr_replay_cache) {
cstate->slot = slot;
cstate->session = session;
/* Return the cached reply status and set cstate->status
- * for nfsd4_svc_encode_compoundres processing */
+ * for nfsd4_proc_compound processing */
status = nfsd4_replay_cache_entry(resp, seq);
cstate->status = nfserr_replay_cache;
- goto replay_cache;
+ goto out;
}
if (status)
goto out;
@@ -1497,23 +1454,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
/* Success! bump slot seqid */
slot->sl_inuse = true;
slot->sl_seqid = seq->seqid;
- slot->sl_cache_entry.ce_cachethis = seq->cachethis;
- /* Always set the cache entry cachethis for solo sequence */
- if (nfsd4_is_solo_sequence(resp))
- slot->sl_cache_entry.ce_cachethis = 1;
+ slot->sl_cachethis = seq->cachethis;
cstate->slot = slot;
cstate->session = session;
-replay_cache:
- /* Renew the clientid on success and on replay.
- * Hold a session reference until done processing the compound:
+ /* Hold a session reference until done processing the compound:
* nfsd4_put_session called only if the cstate slot is set.
*/
- renew_client(session->se_client);
nfsd4_get_session(session);
out:
spin_unlock(&sessionid_lock);
+ /* Renew the clientid on success and on replay */
+ if (cstate->session) {
+ nfs4_lock_state();
+ renew_client(session->se_client);
+ nfs4_unlock_state();
+ }
dprintk("%s: return %d\n", __func__, ntohl(status));
return status;
}
@@ -1522,7 +1479,7 @@ __be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
{
- struct sockaddr_in *sin = svc_addr_in(rqstp);
+ struct sockaddr *sa = svc_addr(rqstp);
struct xdr_netobj clname = {
.len = setclid->se_namelen,
.data = setclid->se_name,
@@ -1531,7 +1488,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
__be32 status;
- char *princ;
char dname[HEXDIR_LEN];
if (!check_name(clname))
@@ -1554,8 +1510,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* RFC 3530 14.2.33 CASE 0: */
status = nfserr_clid_inuse;
if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
- dprintk("NFSD: setclientid: string in use by client"
- " at %pI4\n", &conf->cl_addr);
+ char addr_str[INET6_ADDRSTRLEN];
+ rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
+ sizeof(addr_str));
+ dprintk("NFSD: setclientid: string in use by client "
+ "at %s\n", addr_str);
goto out;
}
}
@@ -1573,7 +1532,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
*/
if (unconf)
expire_client(unconf);
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
gen_clid(new);
@@ -1590,7 +1549,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
*/
expire_client(unconf);
}
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
copy_clid(new, conf);
@@ -1600,7 +1559,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* probable client reboot; state will be removed if
* confirmed.
*/
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
gen_clid(new);
@@ -1611,25 +1570,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* confirmed.
*/
expire_client(unconf);
- new = create_client(clname, dname);
+ new = create_client(clname, dname, rqstp, &clverifier);
if (new == NULL)
goto out;
gen_clid(new);
}
- copy_verf(new, &clverifier);
- new->cl_addr = sin->sin_addr.s_addr;
- new->cl_flavor = rqstp->rq_flavor;
- princ = svc_gss_principal(rqstp);
- if (princ) {
- new->cl_principal = kstrdup(princ, GFP_KERNEL);
- if (new->cl_principal == NULL) {
- free_client(new);
- goto out;
- }
- }
- copy_cred(&new->cl_cred, &rqstp->rq_cred);
- gen_confirm(new);
- gen_callback(new, setclid);
+ gen_callback(new, setclid, rpc_get_scope_id(sa));
add_to_unconfirmed(new, strhashval);
setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1651,7 +1597,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid_confirm *setclientid_confirm)
{
- struct sockaddr_in *sin = svc_addr_in(rqstp);
+ struct sockaddr *sa = svc_addr(rqstp);
struct nfs4_client *conf, *unconf;
nfs4_verifier confirm = setclientid_confirm->sc_confirm;
clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -1670,9 +1616,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
unconf = find_unconfirmed_client(clid);
status = nfserr_clid_inuse;
- if (conf && conf->cl_addr != sin->sin_addr.s_addr)
+ if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
goto out;
- if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
+ if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
goto out;
/*
@@ -2163,7 +2109,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
return -EAGAIN;
}
-static struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lock_manager_operations nfsd_lease_mng_ops = {
.fl_break = nfsd_break_deleg_cb,
.fl_release_private = nfsd_release_deleg_cb,
.fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -3368,7 +3314,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
/* Hack!: For now, we're defining this just so we can use a pointer to it
* as a unique cookie to identify our (NFSv4's) posix locks. */
-static struct lock_manager_operations nfsd_posix_mng_ops = {
+static const struct lock_manager_operations nfsd_posix_mng_ops = {
};
static inline void
@@ -4072,7 +4018,7 @@ set_max_delegations(void)
/* initialization to perform when the nfsd service is started: */
-static void
+static int
__nfs4_state_start(void)
{
unsigned long grace_time;
@@ -4084,19 +4030,26 @@ __nfs4_state_start(void)
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
grace_time/HZ);
laundry_wq = create_singlethread_workqueue("nfsd4");
+ if (laundry_wq == NULL)
+ return -ENOMEM;
queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
set_max_delegations();
+ return set_callback_cred();
}
-void
+int
nfs4_state_start(void)
{
+ int ret;
+
if (nfs4_init)
- return;
+ return 0;
nfsd4_load_reboot_recovery_data();
- __nfs4_state_start();
+ ret = __nfs4_state_start();
+ if (ret)
+ return ret;
nfs4_init = 1;
- return;
+ return 0;
}
time_t
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2dcc7feaa6ff..0fbd50cee1f6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1599,7 +1599,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
{
struct svc_fh tmp_fh;
- char *path, *rootpath;
+ char *path = NULL, *rootpath;
+ size_t rootlen;
fh_init(&tmp_fh, NFS4_FHSIZE);
*stat = exp_pseudoroot(rqstp, &tmp_fh);
@@ -1609,14 +1610,18 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
path = exp->ex_pathname;
- if (strncmp(path, rootpath, strlen(rootpath))) {
+ rootlen = strlen(rootpath);
+ if (strncmp(path, rootpath, rootlen)) {
dprintk("nfsd: fs_locations failed;"
"%s is not contained in %s\n", path, rootpath);
*stat = nfserr_notsupp;
- return NULL;
+ path = NULL;
+ goto out;
}
-
- return path + strlen(rootpath);
+ path += rootlen;
+out:
+ fh_put(&tmp_fh);
+ return path;
}
/*
@@ -1793,11 +1798,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
goto out_nfserr;
}
}
- if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
- if (exp->ex_fslocs.locations == NULL) {
- bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
- }
- }
if ((buflen -= 16) < 0)
goto out_resource;
@@ -1825,8 +1825,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
goto out_resource;
if (!aclsupport)
word0 &= ~FATTR4_WORD0_ACL;
- if (!exp->ex_fslocs.locations)
- word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
if (!word2) {
WRITE32(2);
WRITE32(word0);
@@ -3064,6 +3062,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
WRITE32(0);
ADJUST_ARGS();
+ resp->cstate.datap = p; /* DRC cache data pointer */
return 0;
}
@@ -3166,7 +3165,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
return status;
session = resp->cstate.session;
- if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
+ if (session == NULL || slot->sl_cachethis == 0)
return status;
if (resp->opcnt >= args->opcnt)
@@ -3291,6 +3290,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
/*
* All that remains is to write the tag and operation count...
*/
+ struct nfsd4_compound_state *cs = &resp->cstate;
struct kvec *iov;
p = resp->tagp;
*p++ = htonl(resp->taglen);
@@ -3304,17 +3304,11 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
iov = &rqstp->rq_res.head[0];
iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
BUG_ON(iov->iov_len > PAGE_SIZE);
- if (nfsd4_has_session(&resp->cstate)) {
- if (resp->cstate.status == nfserr_replay_cache &&
- !nfsd4_not_cached(resp)) {
- iov->iov_len = resp->cstate.iovlen;
- } else {
- nfsd4_store_cache_entry(resp);
- dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
- resp->cstate.slot->sl_inuse = 0;
- }
- if (resp->cstate.session)
- nfsd4_put_session(resp->cstate.session);
+ if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
+ nfsd4_store_cache_entry(resp);
+ dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+ resp->cstate.slot->sl_inuse = false;
+ nfsd4_put_session(resp->cstate.session);
}
return 1;
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6d0847562d87..00388d2a3c99 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -37,6 +37,7 @@
#include <linux/nfsd/xdr.h>
#include <linux/nfsd/syscall.h>
#include <linux/lockd/lockd.h>
+#include <linux/sunrpc/clnt.h>
#include <asm/uaccess.h>
#include <net/ipv6.h>
@@ -173,12 +174,13 @@ static const struct file_operations exports_operations = {
};
extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
+extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
static struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = nfsd_pool_stats_release,
.owner = THIS_MODULE,
};
@@ -490,22 +492,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
*
* Input:
* buf: '\n'-terminated C string containing a
- * presentation format IPv4 address
+ * presentation format IP address
* size: length of C string in @buf
* Output:
* On success: returns zero if all specified locks were released;
* returns one if one or more locks were not released
* On error: return code is negative errno value
- *
- * Note: Only AF_INET client addresses are passed in
*/
static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
{
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- };
- int b1, b2, b3, b4;
- char c;
+ struct sockaddr_storage address;
+ struct sockaddr *sap = (struct sockaddr *)&address;
+ size_t salen = sizeof(address);
char *fo_path;
/* sanity check */
@@ -519,14 +517,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
if (qword_get(&buf, fo_path, size) < 0)
return -EINVAL;
- /* get ipv4 address */
- if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
- return -EINVAL;
- if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+ if (rpc_pton(fo_path, size, sap, salen) == 0)
return -EINVAL;
- sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
- return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
+ return nlmsvc_unlock_all_by_ip(sap);
}
/**
@@ -783,10 +777,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
size -= len;
mesg += len;
}
-
- mutex_unlock(&nfsd_mutex);
- return (mesg-buf);
-
+ rv = mesg - buf;
out_free:
kfree(nthreads);
mutex_unlock(&nfsd_mutex);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8847f3fbfc1e..01965b2f3a76 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -397,44 +397,51 @@ static inline void _fh_update_old(struct dentry *dentry,
fh->ofh_dirino = 0;
}
-__be32
-fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
- struct svc_fh *ref_fh)
+static bool is_root_export(struct svc_export *exp)
{
- /* ref_fh is a reference file handle.
- * if it is non-null and for the same filesystem, then we should compose
- * a filehandle which is of the same version, where possible.
- * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
- * Then create a 32byte filehandle using nfs_fhbase_old
- *
- */
+ return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
+}
- u8 version;
- u8 fsid_type = 0;
- struct inode * inode = dentry->d_inode;
- struct dentry *parent = dentry->d_parent;
- __u32 *datap;
- dev_t ex_dev = exp->ex_path.dentry->d_inode->i_sb->s_dev;
- int root_export = (exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root);
+static struct super_block *exp_sb(struct svc_export *exp)
+{
+ return exp->ex_path.dentry->d_inode->i_sb;
+}
- dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
- MAJOR(ex_dev), MINOR(ex_dev),
- (long) exp->ex_path.dentry->d_inode->i_ino,
- parent->d_name.name, dentry->d_name.name,
- (inode ? inode->i_ino : 0));
+static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
+{
+ switch (fsid_type) {
+ case FSID_DEV:
+ if (!old_valid_dev(exp_sb(exp)->s_dev))
+ return 0;
+ /* FALL THROUGH */
+ case FSID_MAJOR_MINOR:
+ case FSID_ENCODE_DEV:
+ return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV;
+ case FSID_NUM:
+ return exp->ex_flags & NFSEXP_FSID;
+ case FSID_UUID8:
+ case FSID_UUID16:
+ if (!is_root_export(exp))
+ return 0;
+ /* fall through */
+ case FSID_UUID4_INUM:
+ case FSID_UUID16_INUM:
+ return exp->ex_uuid != NULL;
+ }
+ return 1;
+}
- /* Choose filehandle version and fsid type based on
- * the reference filehandle (if it is in the same export)
- * or the export options.
- */
- retry:
+
+static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh)
+{
+ u8 version;
+ u8 fsid_type;
+retry:
version = 1;
if (ref_fh && ref_fh->fh_export == exp) {
version = ref_fh->fh_handle.fh_version;
fsid_type = ref_fh->fh_handle.fh_fsid_type;
- if (ref_fh == fhp)
- fh_put(ref_fh);
ref_fh = NULL;
switch (version) {
@@ -447,58 +454,66 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
goto retry;
}
- /* Need to check that this type works for this
- * export point. As the fsid -> filesystem mapping
- * was guided by user-space, there is no guarantee
- * that the filesystem actually supports that fsid
- * type. If it doesn't we loop around again without
- * ref_fh set.
+ /*
+ * As the fsid -> filesystem mapping was guided by
+ * user-space, there is no guarantee that the filesystem
+ * actually supports that fsid type. If it doesn't we
+ * loop around again without ref_fh set.
*/
- switch(fsid_type) {
- case FSID_DEV:
- if (!old_valid_dev(ex_dev))
- goto retry;
- /* FALL THROUGH */
- case FSID_MAJOR_MINOR:
- case FSID_ENCODE_DEV:
- if (!(exp->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
- & FS_REQUIRES_DEV))
- goto retry;
- break;
- case FSID_NUM:
- if (! (exp->ex_flags & NFSEXP_FSID))
- goto retry;
- break;
- case FSID_UUID8:
- case FSID_UUID16:
- if (!root_export)
- goto retry;
- /* fall through */
- case FSID_UUID4_INUM:
- case FSID_UUID16_INUM:
- if (exp->ex_uuid == NULL)
- goto retry;
- break;
- }
+ if (!fsid_type_ok_for_exp(fsid_type, exp))
+ goto retry;
} else if (exp->ex_flags & NFSEXP_FSID) {
fsid_type = FSID_NUM;
} else if (exp->ex_uuid) {
if (fhp->fh_maxsize >= 64) {
- if (root_export)
+ if (is_root_export(exp))
fsid_type = FSID_UUID16;
else
fsid_type = FSID_UUID16_INUM;
} else {
- if (root_export)
+ if (is_root_export(exp))
fsid_type = FSID_UUID8;
else
fsid_type = FSID_UUID4_INUM;
}
- } else if (!old_valid_dev(ex_dev))
+ } else if (!old_valid_dev(exp_sb(exp)->s_dev))
/* for newer device numbers, we must use a newer fsid format */
fsid_type = FSID_ENCODE_DEV;
else
fsid_type = FSID_DEV;
+ fhp->fh_handle.fh_version = version;
+ if (version)
+ fhp->fh_handle.fh_fsid_type = fsid_type;
+}
+
+__be32
+fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+ struct svc_fh *ref_fh)
+{
+ /* ref_fh is a reference file handle.
+ * if it is non-null and for the same filesystem, then we should compose
+ * a filehandle which is of the same version, where possible.
+ * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
+ * Then create a 32byte filehandle using nfs_fhbase_old
+ *
+ */
+
+ struct inode * inode = dentry->d_inode;
+ struct dentry *parent = dentry->d_parent;
+ __u32 *datap;
+ dev_t ex_dev = exp_sb(exp)->s_dev;
+
+ dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
+ MAJOR(ex_dev), MINOR(ex_dev),
+ (long) exp->ex_path.dentry->d_inode->i_ino,
+ parent->d_name.name, dentry->d_name.name,
+ (inode ? inode->i_ino : 0));
+
+ /* Choose filehandle version and fsid type based on
+ * the reference filehandle (if it is in the same export)
+ * or the export options.
+ */
+ set_version_and_fsid_type(fhp, exp, ref_fh);
if (ref_fh == fhp)
fh_put(ref_fh);
@@ -516,7 +531,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_export = exp;
cache_get(&exp->h);
- if (version == 0xca) {
+ if (fhp->fh_handle.fh_version == 0xca) {
/* old style filehandle please */
memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE;
@@ -530,22 +545,22 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
_fh_update_old(dentry, exp, &fhp->fh_handle);
} else {
int len;
- fhp->fh_handle.fh_version = 1;
fhp->fh_handle.fh_auth_type = 0;
datap = fhp->fh_handle.fh_auth+0;
- fhp->fh_handle.fh_fsid_type = fsid_type;
- mk_fsid(fsid_type, datap, ex_dev,
+ mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev,
exp->ex_path.dentry->d_inode->i_ino,
exp->ex_fsid, exp->ex_uuid);
- len = key_len(fsid_type);
+ len = key_len(fhp->fh_handle.fh_fsid_type);
datap += len/4;
fhp->fh_handle.fh_size = 4 + len;
if (inode)
_fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == 255)
+ if (fhp->fh_handle.fh_fileid_type == 255) {
+ fh_put(fhp);
return nfserr_opnotsupp;
+ }
}
return 0;
@@ -639,8 +654,7 @@ enum fsid_source fsid_source(struct svc_fh *fhp)
case FSID_DEV:
case FSID_ENCODE_DEV:
case FSID_MAJOR_MINOR:
- if (fhp->fh_export->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
- & FS_REQUIRES_DEV)
+ if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV)
return FSIDSOURCE_DEV;
break;
case FSID_NUM:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..67ea83eedd43 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,7 @@
#include <linux/nfsd/syscall.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
+#include <linux/seq_file.h>
#define NFSDDBG_FACILITY NFSDDBG_SVC
@@ -66,6 +67,16 @@ struct timeval nfssvc_boot;
DEFINE_MUTEX(nfsd_mutex);
struct svc_serv *nfsd_serv;
+/*
+ * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
+ * nfsd_drc_max_pages limits the total amount of memory available for
+ * version 4.1 DRC caches.
+ * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
+ */
+spinlock_t nfsd_drc_lock;
+unsigned int nfsd_drc_max_mem;
+unsigned int nfsd_drc_mem_used;
+
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static struct svc_stat nfsd_acl_svcstats;
static struct svc_version * nfsd_acl_version[] = {
@@ -235,13 +246,12 @@ void nfsd_reset_versions(void)
*/
static void set_max_drc(void)
{
- /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
- #define NFSD_DRC_SIZE_SHIFT 7
- nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
- >> NFSD_DRC_SIZE_SHIFT;
- nfsd_serv->sv_drc_pages_used = 0;
- dprintk("%s svc_drc_max_pages %u\n", __func__,
- nfsd_serv->sv_drc_max_pages);
+ #define NFSD_DRC_SIZE_SHIFT 10
+ nfsd_drc_max_mem = (nr_free_buffer_pages()
+ >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
+ nfsd_drc_mem_used = 0;
+ spin_lock_init(&nfsd_drc_lock);
+ dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
}
int nfsd_create_serv(void)
@@ -401,7 +411,9 @@ nfsd_svc(unsigned short port, int nrservs)
error = nfsd_racache_init(2*nrservs);
if (error<0)
goto out;
- nfs4_state_start();
+ error = nfs4_state_start();
+ if (error)
+ goto out;
nfsd_reset_versions();
@@ -496,7 +508,9 @@ nfsd(void *vrqstp)
/* Lock the export hash tables for reading. */
exp_readlock();
+ validate_process_creds();
svc_process(rqstp);
+ validate_process_creds();
/* Unlock export hash tables */
exp_readunlock();
@@ -567,10 +581,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+ rqstp->rq_res.head[0].iov_len;
rqstp->rq_res.head[0].iov_len += sizeof(__be32);
- /* NFSv4.1 DRC requires statp */
- if (rqstp->rq_vers == 4)
- nfsd4_set_statp(rqstp, statp);
-
/* Now call the procedure handler, and encode NFS status. */
nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -605,7 +615,25 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
{
- if (nfsd_serv == NULL)
+ int ret;
+ mutex_lock(&nfsd_mutex);
+ if (nfsd_serv == NULL) {
+ mutex_unlock(&nfsd_mutex);
return -ENODEV;
- return svc_pool_stats_open(nfsd_serv, file);
+ }
+ /* bump up the psudo refcount while traversing */
+ svc_get(nfsd_serv);
+ ret = svc_pool_stats_open(nfsd_serv, file);
+ mutex_unlock(&nfsd_mutex);
+ return ret;
+}
+
+int nfsd_pool_stats_release(struct inode *inode, struct file *file)
+{
+ int ret = seq_release(inode, file);
+ mutex_lock(&nfsd_mutex);
+ /* this function really, really should have been called svc_put() */
+ svc_destroy(nfsd_serv);
+ mutex_unlock(&nfsd_mutex);
+ return ret;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..a293f0273263 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -89,6 +89,12 @@ struct raparm_hbucket {
#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
+static inline int
+nfsd_v4client(struct svc_rqst *rq)
+{
+ return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
+}
+
/*
* Called from nfsd_lookup and encode_dirent. Check if we have crossed
* a mount point.
@@ -115,7 +121,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
path_put(&path);
goto out;
}
- if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
+ if (nfsd_v4client(rqstp) ||
+ (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
/* successfully crossed mount point */
/*
* This is subtle: path.dentry is *not* on path.mnt
@@ -684,6 +691,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
__be32 err;
int host_err;
+ validate_process_creds();
+
/*
* If we get here, then the client has already done an "open",
* and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +749,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
out_nfserr:
err = nfserrno(host_err);
out:
+ validate_process_creds();
return err;
}
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
config NILFS2_FS
tristate "NILFS2 file system support (EXPERIMENTAL)"
- depends on BLOCK && EXPERIMENTAL
+ depends on EXPERIMENTAL
select CRC32
help
NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..08834df6ec68 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
}
+/**
+ * nilfs_bmap_lookup_at_level - find a data block or node block
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ * @ptrp: place to store the value associated to @key
+ *
+ * Description: nilfs_bmap_lookup_at_level() finds a record whose key
+ * matches @key in the block at @level of the bmap.
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @ptrp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
__u64 *ptrp)
{
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
return ret;
}
-/**
- * nilfs_bmap_lookup - find a record
- * @bmap: bmap
- * @key: key
- * @recp: pointer to record
- *
- * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
- * @bmap.
- *
- * Return Value: On success, 0 is returned and the record associated with @key
- * is stored in the place pointed by @recp. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- *
- * %-ENOENT - A record associated with @key does not exist.
- */
-int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long *recp)
-{
- __u64 ptr;
- int ret;
-
- /* XXX: use macro for level 1 */
- ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
- if (recp != NULL)
- *recp = ptr;
- return ret;
-}
-
static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
{
__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
@@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
(entries_per_group / NILFS_BMAP_GROUP_DIV);
}
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
- sector_t blocknr)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
- int ret;
-
- ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
- if (likely(!ret))
- nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
- return ret;
-}
-
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
- bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-}
-
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
-{
- nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
- sector_t blocknr)
-{
- return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
-}
-
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
-{
- return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
-}
-
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *oldreq,
- union nilfs_bmap_ptr_req *newreq)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
- int ret;
-
- ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
- if (ret < 0)
- return ret;
- ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
- if (ret < 0)
- nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-
- return ret;
-}
-
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *oldreq,
- union nilfs_bmap_ptr_req *newreq)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
-
- nilfs_dat_commit_end(dat, &oldreq->bpr_req,
- bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
- nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
-}
-
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *oldreq,
- union nilfs_bmap_ptr_req *newreq)
-{
- struct inode *dat = nilfs_bmap_get_dat(bmap);
-
- nilfs_dat_abort_end(dat, &oldreq->bpr_req);
- nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
-}
-
static struct lock_class_key nilfs_bmap_dat_lock_key;
static struct lock_class_key nilfs_bmap_mdt_lock_key;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
#include <linux/buffer_head.h>
#include <linux/nilfs2_fs.h>
#include "alloc.h"
+#include "dat.h"
#define NILFS_BMAP_INVALID_PTR 0
@@ -141,7 +142,6 @@ struct nilfs_bmap {
int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
-int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
+static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
+ __u64 *ptr)
+{
+ return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
+}
+
/*
* Internal use only
*/
struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *);
static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- return nilfs_bmap_prepare_alloc_v(bmap, req);
+ if (dat)
+ return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
/* ignore target ptr */
req->bpr_ptr = bmap->b_last_allocated_ptr++;
return 0;
}
static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_commit_alloc_v(bmap, req);
+ if (dat)
+ nilfs_dat_commit_alloc(dat, &req->bpr_req);
}
static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_abort_alloc_v(bmap, req);
+ if (dat)
+ nilfs_dat_abort_alloc(dat, &req->bpr_req);
else
bmap->b_last_allocated_ptr--;
}
-int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
-
static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- return NILFS_BMAP_USE_VBN(bmap) ?
- nilfs_bmap_prepare_end_v(bmap, req) : 0;
+ return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
}
static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_commit_end_v(bmap, req);
+ if (dat)
+ nilfs_dat_commit_end(dat, &req->bpr_req,
+ bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
}
static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
- union nilfs_bmap_ptr_req *req)
+ union nilfs_bmap_ptr_req *req,
+ struct inode *dat)
{
- if (NILFS_BMAP_USE_VBN(bmap))
- nilfs_bmap_abort_end_v(bmap, req);
+ if (dat)
+ nilfs_dat_abort_end(dat, &req->bpr_req);
}
-int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
- sector_t);
-int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
-int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
-
-
__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
const struct buffer_head *);
__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *,
- union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
- union nilfs_bmap_ptr_req *,
- union nilfs_bmap_ptr_req *);
-
void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 7e0b61be212e..6a2711f4c321 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,7 +46,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
}
-static struct address_space_operations def_btnode_aops = {
+static const struct address_space_operations def_btnode_aops = {
.sync_page = block_sync_page,
};
@@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
* We cannot call radix_tree_preload for the kernels older
* than 2.6.23, because it is not exported for modules.
*/
+retry:
err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (err)
goto failed_unlock;
@@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
(unsigned long long)oldkey,
(unsigned long long)newkey);
-retry:
spin_lock_irq(&btnc->tree_lock);
err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
spin_unlock_irq(&btnc->tree_lock);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..e25b507a474f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
kmem_cache_destroy(nilfs_btree_path_cache);
}
-static inline struct nilfs_btree_path *
-nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
- return (struct nilfs_btree_path *)
- kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+ return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
}
-static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
- struct nilfs_btree_path *path)
+static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
{
kmem_cache_free(nilfs_btree_path_cache, path);
}
-static void nilfs_btree_init_path(const struct nilfs_btree *btree,
- struct nilfs_btree_path *path)
+static void nilfs_btree_init_path(struct nilfs_btree_path *path)
{
int level;
@@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
}
}
-static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
- struct nilfs_btree_path *path)
+static void nilfs_btree_release_path(struct nilfs_btree_path *path)
{
int level;
- for (level = NILFS_BTREE_LEVEL_DATA;
- level < NILFS_BTREE_LEVEL_MAX;
- level++) {
- if (path[level].bp_bh != NULL) {
- brelse(path[level].bp_bh);
- path[level].bp_bh = NULL;
- }
- /* sib_bh is released or deleted by prepare or commit
- * operations. */
- path[level].bp_sib_bh = NULL;
- path[level].bp_index = 0;
- path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
- path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
- path[level].bp_op = NULL;
- }
+ for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
+ level++)
+ brelse(path[level].bp_bh);
}
/*
@@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
}
static inline int
-nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
{
return node->bn_flags;
}
static inline void
-nilfs_btree_node_set_flags(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int flags)
+nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
{
node->bn_flags = flags;
}
-static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
{
- return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
+ return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
}
static inline int
-nilfs_btree_node_get_level(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
{
return node->bn_level;
}
static inline void
-nilfs_btree_node_set_level(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int level)
+nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
{
node->bn_level = level;
}
static inline int
-nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
{
return le16_to_cpu(node->bn_nchildren);
}
static inline void
-nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int nchildren)
+nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
{
node->bn_nchildren = cpu_to_le16(nchildren);
}
-static inline int
-nilfs_btree_node_size(const struct nilfs_btree *btree)
+static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
{
return 1 << btree->bt_bmap.b_inode->i_blkbits;
}
static inline int
-nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
+ const struct nilfs_btree *btree)
{
- return nilfs_btree_node_root(btree, node) ?
+ return nilfs_btree_node_root(node) ?
NILFS_BTREE_ROOT_NCHILDREN_MIN :
NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
}
static inline int
-nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
+ const struct nilfs_btree *btree)
{
- return nilfs_btree_node_root(btree, node) ?
+ return nilfs_btree_node_root(node) ?
NILFS_BTREE_ROOT_NCHILDREN_MAX :
NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
}
static inline __le64 *
-nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
{
return (__le64 *)((char *)(node + 1) +
- (nilfs_btree_node_root(btree, node) ?
+ (nilfs_btree_node_root(node) ?
0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
}
static inline __le64 *
-nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node)
+nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
+ const struct nilfs_btree *btree)
{
- return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
- nilfs_btree_node_nchildren_max(btree, node));
+ return (__le64 *)(nilfs_btree_node_dkeys(node) +
+ nilfs_btree_node_nchildren_max(node, btree));
}
static inline __u64
-nilfs_btree_node_get_key(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node, int index)
+nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
{
- return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
- index));
+ return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
}
static inline void
-nilfs_btree_node_set_key(struct nilfs_btree *btree,
- struct nilfs_btree_node *node, int index, __u64 key)
+nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
{
- *(nilfs_btree_node_dkeys(btree, node) + index) =
- nilfs_bmap_key_to_dkey(key);
+ *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
}
static inline __u64
nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node,
- int index)
+ const struct nilfs_btree_node *node, int index)
{
- return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
+ return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
index));
}
static inline void
nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
- struct nilfs_btree_node *node,
- int index,
- __u64 ptr)
+ struct nilfs_btree_node *node, int index, __u64 ptr)
{
- *(nilfs_btree_node_dptrs(btree, node) + index) =
+ *(nilfs_btree_node_dptrs(node, btree) + index) =
nilfs_bmap_ptr_to_dptr(ptr);
}
@@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
__le64 *dptrs;
int i;
- nilfs_btree_node_set_flags(btree, node, flags);
- nilfs_btree_node_set_level(btree, node, level);
- nilfs_btree_node_set_nchildren(btree, node, nchildren);
+ nilfs_btree_node_set_flags(node, flags);
+ nilfs_btree_node_set_level(node, level);
+ nilfs_btree_node_set_nchildren(node, nchildren);
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
for (i = 0; i < nchildren; i++) {
dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
__le64 *ldptrs, *rdptrs;
int lnchildren, rnchildren;
- ldkeys = nilfs_btree_node_dkeys(btree, left);
- ldptrs = nilfs_btree_node_dptrs(btree, left);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ ldkeys = nilfs_btree_node_dkeys(left);
+ ldptrs = nilfs_btree_node_dptrs(left, btree);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
- rdkeys = nilfs_btree_node_dkeys(btree, right);
- rdptrs = nilfs_btree_node_dptrs(btree, right);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ rdkeys = nilfs_btree_node_dkeys(right);
+ rdptrs = nilfs_btree_node_dptrs(right, btree);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
lnchildren += n;
rnchildren -= n;
- nilfs_btree_node_set_nchildren(btree, left, lnchildren);
- nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+ nilfs_btree_node_set_nchildren(left, lnchildren);
+ nilfs_btree_node_set_nchildren(right, rnchildren);
}
/* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
__le64 *ldptrs, *rdptrs;
int lnchildren, rnchildren;
- ldkeys = nilfs_btree_node_dkeys(btree, left);
- ldptrs = nilfs_btree_node_dptrs(btree, left);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ ldkeys = nilfs_btree_node_dkeys(left);
+ ldptrs = nilfs_btree_node_dptrs(left, btree);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
- rdkeys = nilfs_btree_node_dkeys(btree, right);
- rdptrs = nilfs_btree_node_dptrs(btree, right);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ rdkeys = nilfs_btree_node_dkeys(right);
+ rdptrs = nilfs_btree_node_dptrs(right, btree);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
lnchildren -= n;
rnchildren += n;
- nilfs_btree_node_set_nchildren(btree, left, lnchildren);
- nilfs_btree_node_set_nchildren(btree, right, rnchildren);
+ nilfs_btree_node_set_nchildren(left, lnchildren);
+ nilfs_btree_node_set_nchildren(right, rnchildren);
}
/* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
__le64 *dptrs;
int nchildren;
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
+ nchildren = nilfs_btree_node_get_nchildren(node);
if (index < nchildren) {
memmove(dkeys + index + 1, dkeys + index,
(nchildren - index) * sizeof(*dkeys));
@@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
dkeys[index] = nilfs_bmap_key_to_dkey(key);
dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
nchildren++;
- nilfs_btree_node_set_nchildren(btree, node, nchildren);
+ nilfs_btree_node_set_nchildren(node, nchildren);
}
/* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
__le64 *dptrs;
int nchildren;
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
key = nilfs_bmap_dkey_to_key(dkeys[index]);
ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ nchildren = nilfs_btree_node_get_nchildren(node);
if (keyp != NULL)
*keyp = key;
if (ptrp != NULL)
@@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
(nchildren - index - 1) * sizeof(*dptrs));
}
nchildren--;
- nilfs_btree_node_set_nchildren(btree, node, nchildren);
+ nilfs_btree_node_set_nchildren(node, nchildren);
}
-static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
- const struct nilfs_btree_node *node,
+static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
__u64 key, int *indexp)
{
__u64 nkey;
@@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
/* binary search */
low = 0;
- high = nilfs_btree_node_get_nchildren(btree, node) - 1;
+ high = nilfs_btree_node_get_nchildren(node) - 1;
index = 0;
s = 0;
while (low <= high) {
index = (low + high) / 2;
- nkey = nilfs_btree_node_get_key(btree, node, index);
+ nkey = nilfs_btree_node_get_key(node, index);
if (nkey == key) {
s = 0;
goto out;
@@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
}
/* adjust index */
- if (nilfs_btree_node_get_level(btree, node) >
- NILFS_BTREE_LEVEL_NODE_MIN) {
- if ((s > 0) && (index > 0))
+ if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
+ if (s > 0 && index > 0)
index--;
} else if (s < 0)
index++;
@@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
}
static inline struct nilfs_btree_node *
-nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
- const struct nilfs_btree_path *path,
- int level)
+nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
{
return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
}
static inline struct nilfs_btree_node *
-nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
- const struct nilfs_btree_path *path,
- int level)
+nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
{
return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
}
static inline int nilfs_btree_height(const struct nilfs_btree *btree)
{
- return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
- + 1;
+ return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
}
static inline struct nilfs_btree_node *
@@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
{
return (level == nilfs_btree_height(btree) - 1) ?
nilfs_btree_get_root(btree) :
- nilfs_btree_get_nonroot_node(btree, path, level);
+ nilfs_btree_get_nonroot_node(path, level);
}
static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
int level, index, found, ret;
node = nilfs_btree_get_root(btree);
- level = nilfs_btree_node_get_level(btree, node);
- if ((level < minlevel) ||
- (nilfs_btree_node_get_nchildren(btree, node) <= 0))
+ level = nilfs_btree_node_get_level(node);
+ if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
return -ENOENT;
- found = nilfs_btree_node_lookup(btree, node, key, &index);
+ found = nilfs_btree_node_lookup(node, key, &index);
ptr = nilfs_btree_node_get_ptr(btree, node, index);
path[level].bp_bh = NULL;
path[level].bp_index = index;
@@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
if (ret < 0)
return ret;
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- BUG_ON(level != nilfs_btree_node_get_level(btree, node));
+ node = nilfs_btree_get_nonroot_node(path, level);
+ BUG_ON(level != nilfs_btree_node_get_level(node));
if (!found)
- found = nilfs_btree_node_lookup(btree, node, key,
- &index);
+ found = nilfs_btree_node_lookup(node, key, &index);
else
index = 0;
- if (index < nilfs_btree_node_nchildren_max(btree, node))
+ if (index < nilfs_btree_node_nchildren_max(node, btree))
ptr = nilfs_btree_node_get_ptr(btree, node, index);
else {
WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
int index, level, ret;
node = nilfs_btree_get_root(btree);
- index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+ index = nilfs_btree_node_get_nchildren(node) - 1;
if (index < 0)
return -ENOENT;
- level = nilfs_btree_node_get_level(btree, node);
+ level = nilfs_btree_node_get_level(node);
ptr = nilfs_btree_node_get_ptr(btree, node, index);
path[level].bp_bh = NULL;
path[level].bp_index = index;
@@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
if (ret < 0)
return ret;
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- BUG_ON(level != nilfs_btree_node_get_level(btree, node));
- index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+ node = nilfs_btree_get_nonroot_node(path, level);
+ BUG_ON(level != nilfs_btree_node_get_level(node));
+ index = nilfs_btree_node_get_nchildren(node) - 1;
ptr = nilfs_btree_node_get_ptr(btree, node, index);
path[level].bp_index = index;
}
if (keyp != NULL)
- *keyp = nilfs_btree_node_get_key(btree, node, index);
+ *keyp = nilfs_btree_node_get_key(node, index);
if (ptrp != NULL)
*ptrp = ptr;
@@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
int ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
if (ptrp != NULL)
*ptrp = ptr;
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
int level = NILFS_BTREE_LEVEL_NODE_MIN;
int ret, cnt, index, maxlevel;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
if (ret < 0)
goto out;
@@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
node = nilfs_btree_get_node(btree, path, level);
index = path[level].bp_index + 1;
for (;;) {
- while (index < nilfs_btree_node_get_nchildren(btree, node)) {
- if (nilfs_btree_node_get_key(btree, node, index) !=
+ while (index < nilfs_btree_node_get_nchildren(node)) {
+ if (nilfs_btree_node_get_key(node, index) !=
key + cnt)
goto end;
ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
/* look-up right sibling node */
node = nilfs_btree_get_node(btree, path, level + 1);
index = path[level + 1].bp_index + 1;
- if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
- nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+ if (index >= nilfs_btree_node_get_nchildren(node) ||
+ nilfs_btree_node_get_key(node, index) != key + cnt)
break;
ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
path[level + 1].bp_index = index;
@@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
if (ret < 0)
goto out;
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
index = 0;
path[level].bp_index = index;
}
@@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
*ptrp = ptr;
ret = cnt;
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
do {
lock_buffer(path[level].bp_bh);
nilfs_btree_node_set_key(
- btree,
- nilfs_btree_get_nonroot_node(
- btree, path, level),
+ nilfs_btree_get_nonroot_node(path, level),
path[level].bp_index, key);
if (!buffer_dirty(path[level].bp_bh))
nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
/* root */
if (level == nilfs_btree_height(btree) - 1) {
- nilfs_btree_node_set_key(btree,
- nilfs_btree_get_root(btree),
+ nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
path[level].bp_index, key);
}
}
@@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
if (level < nilfs_btree_height(btree) - 1) {
lock_buffer(path[level].bp_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
path[level].bp_index);
if (!buffer_dirty(path[level].bp_bh))
@@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
if (path[level].bp_index == 0)
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(
- btree, node, 0));
+ nilfs_btree_node_get_key(node,
+ 0));
} else {
node = nilfs_btree_get_root(btree);
nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
lock_buffer(path[level].bp_sib_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- left = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ left = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
move = 0;
n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
unlock_buffer(path[level].bp_sib_bh);
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, node, 0));
+ nilfs_btree_node_get_key(node, 0));
if (move) {
brelse(path[level].bp_bh);
@@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
lock_buffer(path[level].bp_sib_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
move = 0;
n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
path[level + 1].bp_index++;
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, right, 0));
+ nilfs_btree_node_get_key(right, 0));
path[level + 1].bp_index--;
if (move) {
brelse(path[level].bp_bh);
path[level].bp_bh = path[level].bp_sib_bh;
path[level].bp_sib_bh = NULL;
- path[level].bp_index -=
- nilfs_btree_node_get_nchildren(btree, node);
+ path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
path[level + 1].bp_index++;
} else {
brelse(path[level].bp_sib_bh);
@@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
lock_buffer(path[level].bp_sib_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
move = 0;
n = (nchildren + 1) / 2;
@@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
unlock_buffer(path[level].bp_bh);
unlock_buffer(path[level].bp_sib_bh);
- newkey = nilfs_btree_node_get_key(btree, right, 0);
+ newkey = nilfs_btree_node_get_key(right, 0);
newptr = path[level].bp_newreq.bpr_ptr;
if (move) {
- path[level].bp_index -=
- nilfs_btree_node_get_nchildren(btree, node);
+ path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
path[level].bp_index);
- *keyp = nilfs_btree_node_get_key(btree, right, 0);
+ *keyp = nilfs_btree_node_get_key(right, 0);
*ptrp = path[level].bp_newreq.bpr_ptr;
brelse(path[level].bp_bh);
@@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
} else {
nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
- *keyp = nilfs_btree_node_get_key(btree, right, 0);
+ *keyp = nilfs_btree_node_get_key(right, 0);
*ptrp = path[level].bp_newreq.bpr_ptr;
brelse(path[level].bp_sib_bh);
@@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
lock_buffer(path[level].bp_sib_bh);
root = nilfs_btree_get_root(btree);
- child = nilfs_btree_get_sib_node(btree, path, level);
+ child = nilfs_btree_get_sib_node(path, level);
- n = nilfs_btree_node_get_nchildren(btree, root);
+ n = nilfs_btree_node_get_nchildren(root);
nilfs_btree_node_move_right(btree, root, child, n);
- nilfs_btree_node_set_level(btree, root, level + 1);
+ nilfs_btree_node_set_level(root, level + 1);
if (!buffer_dirty(path[level].bp_sib_bh))
nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
- *keyp = nilfs_btree_node_get_key(btree, child, 0);
+ *keyp = nilfs_btree_node_get_key(child, 0);
*ptrp = path[level].bp_newreq.bpr_ptr;
}
@@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
struct nilfs_btree_node *node, *parent, *sib;
__u64 sibptr;
int pindex, level, ret;
+ struct inode *dat = NULL;
stats->bs_nblocks = 0;
level = NILFS_BTREE_LEVEL_DATA;
/* allocate a new ptr for data block */
- if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+ if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
path[level].bp_newreq.bpr_ptr =
nilfs_btree_find_target_v(btree, path, key);
+ dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+ }
ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
if (ret < 0)
goto err_out_data;
for (level = NILFS_BTREE_LEVEL_NODE_MIN;
level < nilfs_btree_height(btree) - 1;
level++) {
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- if (nilfs_btree_node_get_nchildren(btree, node) <
- nilfs_btree_node_nchildren_max(btree, node)) {
+ node = nilfs_btree_get_nonroot_node(path, level);
+ if (nilfs_btree_node_get_nchildren(node) <
+ nilfs_btree_node_nchildren_max(node, btree)) {
path[level].bp_op = nilfs_btree_do_insert;
stats->bs_nblocks++;
goto out;
@@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
if (ret < 0)
goto err_out_child_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) <
- nilfs_btree_node_nchildren_max(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) <
+ nilfs_btree_node_nchildren_max(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_carry_left;
stats->bs_nblocks++;
@@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
/* right sibling */
if (pindex <
- nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+ nilfs_btree_node_get_nchildren(parent) - 1) {
sibptr = nilfs_btree_node_get_ptr(btree, parent,
pindex + 1);
ret = nilfs_btree_get_block(btree, sibptr, &bh);
if (ret < 0)
goto err_out_child_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) <
- nilfs_btree_node_nchildren_max(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) <
+ nilfs_btree_node_nchildren_max(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_carry_right;
stats->bs_nblocks++;
@@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
path[level].bp_newreq.bpr_ptr =
path[level - 1].bp_newreq.bpr_ptr + 1;
ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
if (ret < 0)
goto err_out_child_node;
ret = nilfs_btree_get_new_block(btree,
@@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
/* root */
node = nilfs_btree_get_root(btree);
- if (nilfs_btree_node_get_nchildren(btree, node) <
- nilfs_btree_node_nchildren_max(btree, node)) {
+ if (nilfs_btree_node_get_nchildren(node) <
+ nilfs_btree_node_nchildren_max(node, btree)) {
path[level].bp_op = nilfs_btree_do_insert;
stats->bs_nblocks++;
goto out;
@@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
/* grow */
path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
if (ret < 0)
goto err_out_child_node;
ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
/* error */
err_out_curr_node:
- nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+ nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+ dat);
err_out_child_node:
for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
nilfs_btnode_delete(path[level].bp_sib_bh);
nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
- &path[level].bp_newreq);
+ &path[level].bp_newreq, dat);
}
- nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
+ nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
+ dat);
err_out_data:
*levelp = level;
stats->bs_nblocks = 0;
@@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
int maxlevel, __u64 key, __u64 ptr)
{
+ struct inode *dat = NULL;
int level;
set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
- if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+ if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
nilfs_btree_set_target_v(btree, key, ptr);
+ dat = nilfs_bmap_get_dat(&btree->bt_bmap);
+ }
for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
- &path[level - 1].bp_newreq);
+ &path[level - 1].bp_newreq, dat);
path[level].bp_op(btree, path, level, &key, &ptr);
}
@@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
int level, ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, NULL,
NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
if (level < nilfs_btree_height(btree) - 1) {
lock_buffer(path[level].bp_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
nilfs_btree_node_delete(btree, node, keyp, ptrp,
path[level].bp_index);
if (!buffer_dirty(path[level].bp_bh))
@@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
unlock_buffer(path[level].bp_bh);
if (path[level].bp_index == 0)
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, node, 0));
+ nilfs_btree_node_get_key(node, 0));
} else {
node = nilfs_btree_get_root(btree);
nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
lock_buffer(path[level].bp_sib_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- left = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ left = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ lnchildren = nilfs_btree_node_get_nchildren(left);
n = (nchildren + lnchildren) / 2 - nchildren;
@@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
unlock_buffer(path[level].bp_sib_bh);
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, node, 0));
+ nilfs_btree_node_get_key(node, 0));
brelse(path[level].bp_sib_bh);
path[level].bp_sib_bh = NULL;
@@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
lock_buffer(path[level].bp_sib_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ rnchildren = nilfs_btree_node_get_nchildren(right);
n = (nchildren + rnchildren) / 2 - nchildren;
@@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
path[level + 1].bp_index++;
nilfs_btree_promote_key(btree, path, level + 1,
- nilfs_btree_node_get_key(btree, right, 0));
+ nilfs_btree_node_get_key(right, 0));
path[level + 1].bp_index--;
brelse(path[level].bp_sib_bh);
@@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
lock_buffer(path[level].bp_sib_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- left = nilfs_btree_get_sib_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ left = nilfs_btree_get_sib_node(path, level);
- n = nilfs_btree_node_get_nchildren(btree, node);
+ n = nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_move_left(btree, left, node, n);
@@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
nilfs_btnode_delete(path[level].bp_bh);
path[level].bp_bh = path[level].bp_sib_bh;
path[level].bp_sib_bh = NULL;
- path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
+ path[level].bp_index += nilfs_btree_node_get_nchildren(left);
}
static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
lock_buffer(path[level].bp_sib_bh);
- node = nilfs_btree_get_nonroot_node(btree, path, level);
- right = nilfs_btree_get_sib_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
+ right = nilfs_btree_get_sib_node(path, level);
- n = nilfs_btree_node_get_nchildren(btree, right);
+ n = nilfs_btree_node_get_nchildren(right);
nilfs_btree_node_move_left(btree, node, right, n);
@@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
lock_buffer(path[level].bp_bh);
root = nilfs_btree_get_root(btree);
- child = nilfs_btree_get_nonroot_node(btree, path, level);
+ child = nilfs_btree_get_nonroot_node(path, level);
nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
- nilfs_btree_node_set_level(btree, root, level);
- n = nilfs_btree_node_get_nchildren(btree, child);
+ nilfs_btree_node_set_level(root, level);
+ n = nilfs_btree_node_get_nchildren(child);
nilfs_btree_node_move_left(btree, root, child, n);
unlock_buffer(path[level].bp_bh);
@@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
int *levelp,
- struct nilfs_bmap_stats *stats)
+ struct nilfs_bmap_stats *stats,
+ struct inode *dat)
{
struct buffer_head *bh;
struct nilfs_btree_node *node, *parent, *sib;
@@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
for (level = NILFS_BTREE_LEVEL_NODE_MIN;
level < nilfs_btree_height(btree) - 1;
level++) {
- node = nilfs_btree_get_nonroot_node(btree, path, level);
+ node = nilfs_btree_get_nonroot_node(path, level);
path[level].bp_oldreq.bpr_ptr =
nilfs_btree_node_get_ptr(btree, node,
path[level].bp_index);
ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
if (ret < 0)
goto err_out_child_node;
- if (nilfs_btree_node_get_nchildren(btree, node) >
- nilfs_btree_node_nchildren_min(btree, node)) {
+ if (nilfs_btree_node_get_nchildren(node) >
+ nilfs_btree_node_nchildren_min(node, btree)) {
path[level].bp_op = nilfs_btree_do_delete;
stats->bs_nblocks++;
goto out;
@@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
if (ret < 0)
goto err_out_curr_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) >
- nilfs_btree_node_nchildren_min(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) >
+ nilfs_btree_node_nchildren_min(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_borrow_left;
stats->bs_nblocks++;
@@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
/* continue; */
}
} else if (pindex <
- nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+ nilfs_btree_node_get_nchildren(parent) - 1) {
/* right sibling */
sibptr = nilfs_btree_node_get_ptr(btree, parent,
pindex + 1);
@@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
if (ret < 0)
goto err_out_curr_node;
sib = (struct nilfs_btree_node *)bh->b_data;
- if (nilfs_btree_node_get_nchildren(btree, sib) >
- nilfs_btree_node_nchildren_min(btree, sib)) {
+ if (nilfs_btree_node_get_nchildren(sib) >
+ nilfs_btree_node_nchildren_min(sib, btree)) {
path[level].bp_sib_bh = bh;
path[level].bp_op = nilfs_btree_borrow_right;
stats->bs_nblocks++;
@@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
/* no siblings */
/* the only child of the root node */
WARN_ON(level != nilfs_btree_height(btree) - 2);
- if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
+ if (nilfs_btree_node_get_nchildren(node) - 1 <=
NILFS_BTREE_ROOT_NCHILDREN_MAX) {
path[level].bp_op = nilfs_btree_shrink;
stats->bs_nblocks += 2;
@@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
if (ret < 0)
goto err_out_child_node;
@@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
/* error */
err_out_curr_node:
- nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
+ nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
err_out_child_node:
for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
brelse(path[level].bp_sib_bh);
nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
}
*levelp = level;
stats->bs_nblocks = 0;
@@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int maxlevel)
+ int maxlevel, struct inode *dat)
{
int level;
for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
- &path[level].bp_oldreq);
+ &path[level].bp_oldreq, dat);
path[level].bp_op(btree, path, level, NULL, NULL);
}
@@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
struct nilfs_btree *btree;
struct nilfs_btree_path *path;
struct nilfs_bmap_stats stats;
+ struct inode *dat;
int level, ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, NULL,
NILFS_BTREE_LEVEL_NODE_MIN);
if (ret < 0)
goto out;
- ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
+
+ dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
+ nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
+
+ ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
if (ret < 0)
goto out;
- nilfs_btree_commit_delete(btree, path, level);
+ nilfs_btree_commit_delete(btree, path, level, dat);
nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
int ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
node = root;
break;
case 3:
- nchildren = nilfs_btree_node_get_nchildren(btree, root);
+ nchildren = nilfs_btree_node_get_nchildren(root);
if (nchildren > 1)
return 0;
ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
return 0;
}
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
- maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
+ nchildren = nilfs_btree_node_get_nchildren(node);
+ maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
nextmaxkey = (nchildren > 1) ?
- nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
+ nilfs_btree_node_get_key(node, nchildren - 2) : 0;
if (bh != NULL)
brelse(bh);
@@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
node = root;
break;
case 3:
- nchildren = nilfs_btree_node_get_nchildren(btree, root);
+ nchildren = nilfs_btree_node_get_nchildren(root);
WARN_ON(nchildren > 1);
ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
return -EINVAL;
}
- nchildren = nilfs_btree_node_get_nchildren(btree, node);
+ nchildren = nilfs_btree_node_get_nchildren(node);
if (nchildren < nitems)
nitems = nchildren;
- dkeys = nilfs_btree_node_dkeys(btree, node);
- dptrs = nilfs_btree_node_dptrs(btree, node);
+ dkeys = nilfs_btree_node_dkeys(node);
+ dptrs = nilfs_btree_node_dptrs(node, btree);
for (i = 0; i < nitems; i++) {
keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
struct nilfs_bmap_stats *stats)
{
struct buffer_head *bh;
- struct nilfs_btree *btree;
+ struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+ struct inode *dat = NULL;
int ret;
- btree = (struct nilfs_btree *)bmap;
stats->bs_nblocks = 0;
/* for data */
/* cannot find near ptr */
- if (NILFS_BMAP_USE_VBN(bmap))
+ if (NILFS_BMAP_USE_VBN(bmap)) {
dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+ dat = nilfs_bmap_get_dat(bmap);
+ }
- ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
+ ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
if (ret < 0)
return ret;
@@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
stats->bs_nblocks++;
if (nreq != NULL) {
nreq->bpr_ptr = dreq->bpr_ptr + 1;
- ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
+ ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
if (ret < 0)
goto err_out_dreq;
@@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
/* error */
err_out_nreq:
- nilfs_bmap_abort_alloc_ptr(bmap, nreq);
+ nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
err_out_dreq:
- nilfs_bmap_abort_alloc_ptr(bmap, dreq);
+ nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
stats->bs_nblocks = 0;
return ret;
@@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
union nilfs_bmap_ptr_req *nreq,
struct buffer_head *bh)
{
- struct nilfs_btree *btree;
+ struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
struct nilfs_btree_node *node;
+ struct inode *dat;
__u64 tmpptr;
/* free resources */
@@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
/* convert and insert */
- btree = (struct nilfs_btree *)bmap;
+ dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
nilfs_btree_init(bmap);
if (nreq != NULL) {
- nilfs_bmap_commit_alloc_ptr(bmap, dreq);
- nilfs_bmap_commit_alloc_ptr(bmap, nreq);
+ nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
+ nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
/* create child node at level 1 */
lock_buffer(bh);
@@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
2, 1, &keys[0], &tmpptr);
} else {
- nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+ nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
/* create root node at level 1 */
node = nilfs_btree_get_root(btree);
@@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level)
+ int level, struct inode *dat)
{
struct nilfs_btree_node *parent;
int ret;
@@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
nilfs_btree_node_get_ptr(btree, parent,
path[level + 1].bp_index);
path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
- ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req);
if (ret < 0)
return ret;
@@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
&path[level].bp_ctxt);
if (ret < 0) {
- nilfs_bmap_abort_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ nilfs_dat_abort_update(dat,
+ &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req);
return ret;
}
}
@@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level)
+ int level, struct inode *dat)
{
struct nilfs_btree_node *parent;
- nilfs_bmap_commit_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req,
+ btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
if (buffer_nilfs_node(path[level].bp_bh)) {
nilfs_btnode_commit_change_key(
@@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level)
+ int level, struct inode *dat)
{
- nilfs_bmap_abort_update_v(&btree->bt_bmap,
- &path[level].bp_oldreq,
- &path[level].bp_newreq);
+ nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
+ &path[level].bp_newreq.bpr_req);
if (buffer_nilfs_node(path[level].bp_bh))
nilfs_btnode_abort_change_key(
&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int minlevel,
- int *maxlevelp)
+ int minlevel, int *maxlevelp,
+ struct inode *dat)
{
int level, ret;
level = minlevel;
if (!buffer_nilfs_volatile(path[level].bp_bh)) {
- ret = nilfs_btree_prepare_update_v(btree, path, level);
+ ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
if (ret < 0)
return ret;
}
@@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
!buffer_dirty(path[level].bp_bh)) {
WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
- ret = nilfs_btree_prepare_update_v(btree, path, level);
+ ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
if (ret < 0)
goto out;
}
@@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
/* error */
out:
while (--level > minlevel)
- nilfs_btree_abort_update_v(btree, path, level);
+ nilfs_btree_abort_update_v(btree, path, level, dat);
if (!buffer_nilfs_volatile(path[level].bp_bh))
- nilfs_btree_abort_update_v(btree, path, level);
+ nilfs_btree_abort_update_v(btree, path, level, dat);
return ret;
}
static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int minlevel,
- int maxlevel,
- struct buffer_head *bh)
+ int minlevel, int maxlevel,
+ struct buffer_head *bh,
+ struct inode *dat)
{
int level;
if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
- nilfs_btree_commit_update_v(btree, path, minlevel);
+ nilfs_btree_commit_update_v(btree, path, minlevel, dat);
for (level = minlevel + 1; level <= maxlevel; level++)
- nilfs_btree_commit_update_v(btree, path, level);
+ nilfs_btree_commit_update_v(btree, path, level, dat);
}
static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
struct nilfs_btree_path *path,
- int level,
- struct buffer_head *bh)
+ int level, struct buffer_head *bh)
{
int maxlevel, ret;
struct nilfs_btree_node *parent;
+ struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
__u64 ptr;
get_bh(bh);
path[level].bp_bh = bh;
- ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
+ ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
+ dat);
if (ret < 0)
goto out;
@@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
parent = nilfs_btree_get_node(btree, path, level + 1);
ptr = nilfs_btree_node_get_ptr(btree, parent,
path[level + 1].bp_index);
- ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
+ ret = nilfs_dat_mark_dirty(dat, ptr);
if (ret < 0)
goto out;
}
- nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
+ nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
out:
brelse(path[level].bp_bh);
@@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
WARN_ON(!buffer_dirty(bh));
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
if (buffer_nilfs_node(bh)) {
node = (struct nilfs_btree_node *)bh->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
- level = nilfs_btree_node_get_level(btree, node);
+ key = nilfs_btree_node_get_key(node, 0);
+ level = nilfs_btree_node_get_level(node);
} else {
key = nilfs_bmap_data_get_key(bmap, bh);
level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
nilfs_btree_propagate_p(btree, path, level, bh);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
struct buffer_head *bh)
{
- return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
+ return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
}
static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
get_bh(bh);
node = (struct nilfs_btree_node *)bh->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
- level = nilfs_btree_node_get_level(btree, node);
+ key = nilfs_btree_node_get_key(node, 0);
+ level = nilfs_btree_node_get_level(node);
list_for_each(head, &lists[level]) {
cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
cnode = (struct nilfs_btree_node *)cbh->b_data;
- ckey = nilfs_btree_node_get_key(btree, cnode, 0);
+ ckey = nilfs_btree_node_get_key(cnode, 0);
if (key < ckey)
break;
}
@@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
nilfs_btree_node_set_ptr(btree, parent,
path[level + 1].bp_index, blocknr);
- key = nilfs_btree_node_get_key(btree, parent,
- path[level + 1].bp_index);
+ key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
/* on-disk format */
binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
binfo->bi_dat.bi_level = level;
@@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
union nilfs_binfo *binfo)
{
struct nilfs_btree_node *parent;
+ struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
__u64 key;
__u64 ptr;
union nilfs_bmap_ptr_req req;
@@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
ptr = nilfs_btree_node_get_ptr(btree, parent,
path[level + 1].bp_index);
req.bpr_ptr = ptr;
- ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
- if (unlikely(ret < 0))
+ ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+ if (ret < 0)
return ret;
+ nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
- key = nilfs_btree_node_get_key(btree, parent,
- path[level + 1].bp_index);
+ key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
/* on-disk format */
binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
int level, ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
if (buffer_nilfs_node(*bh)) {
node = (struct nilfs_btree_node *)(*bh)->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
- level = nilfs_btree_node_get_level(btree, node);
+ key = nilfs_btree_node_get_key(node, 0);
+ level = nilfs_btree_node_get_level(node);
} else {
key = nilfs_bmap_data_get_key(bmap, *bh);
level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
@@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
sector_t blocknr,
union nilfs_binfo *binfo)
{
- struct nilfs_btree *btree;
struct nilfs_btree_node *node;
__u64 key;
int ret;
- btree = (struct nilfs_btree *)bmap;
- ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
+ ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
+ blocknr);
if (ret < 0)
return ret;
if (buffer_nilfs_node(*bh)) {
node = (struct nilfs_btree_node *)(*bh)->b_data;
- key = nilfs_btree_node_get_key(btree, node, 0);
+ key = nilfs_btree_node_get_key(node, 0);
} else
key = nilfs_bmap_data_get_key(bmap, *bh);
@@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
int ret;
btree = (struct nilfs_btree *)bmap;
- path = nilfs_btree_alloc_path(btree);
+ path = nilfs_btree_alloc_path();
if (path == NULL)
return -ENOMEM;
- nilfs_btree_init_path(btree, path);
+ nilfs_btree_init_path(path);
ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
if (ret < 0) {
@@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
nilfs_bmap_set_dirty(&btree->bt_bmap);
out:
- nilfs_btree_clear_path(btree, path);
- nilfs_btree_free_path(btree, path);
+ nilfs_btree_release_path(path);
+ nilfs_btree_free_path(path);
return ret;
}
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..1c6cfb59128d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
void *kaddr;
int ret;
- if (cno == 0)
- return -ENOENT; /* checkpoint number 0 is invalid */
+ /* CP number is invalid if it's zero or larger than the
+ largest exist one.*/
+ if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
+ return -ENOENT;
down_read(&NILFS_MDT(cpfile)->mi_sem);
ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
goto out;
kaddr = kmap_atomic(bh->b_page, KM_USER0);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
- ret = nilfs_checkpoint_snapshot(cp);
+ if (nilfs_checkpoint_invalid(cp))
+ ret = -ENOENT;
+ else
+ ret = nilfs_checkpoint_snapshot(cp);
kunmap_atomic(kaddr, KM_USER0);
brelse(bh);
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..debea896e701 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
#include <linux/buffer_head.h>
#include <linux/nilfs2_fs.h>
-#define NILFS_CPFILE_GFP NILFS_MDT_GFP
-
int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
struct nilfs_checkpoint **,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..1ff8e15bd36b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
nilfs_palloc_commit_free_entry(dat, req);
}
-void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
- nilfs_dat_abort_entry(dat, req);
- nilfs_palloc_abort_free_entry(dat, req);
-}
-
int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
{
int ret;
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
nilfs_dat_commit_entry(dat, req);
}
-void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
-{
- nilfs_dat_abort_entry(dat, req);
-}
-
int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
{
struct nilfs_dat_entry *entry;
@@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
nilfs_dat_abort_entry(dat, req);
}
+int nilfs_dat_prepare_update(struct inode *dat,
+ struct nilfs_palloc_req *oldreq,
+ struct nilfs_palloc_req *newreq)
+{
+ int ret;
+
+ ret = nilfs_dat_prepare_end(dat, oldreq);
+ if (!ret) {
+ ret = nilfs_dat_prepare_alloc(dat, newreq);
+ if (ret < 0)
+ nilfs_dat_abort_end(dat, oldreq);
+ }
+ return ret;
+}
+
+void nilfs_dat_commit_update(struct inode *dat,
+ struct nilfs_palloc_req *oldreq,
+ struct nilfs_palloc_req *newreq, int dead)
+{
+ nilfs_dat_commit_end(dat, oldreq, dead);
+ nilfs_dat_commit_alloc(dat, newreq);
+}
+
+void nilfs_dat_abort_update(struct inode *dat,
+ struct nilfs_palloc_req *oldreq,
+ struct nilfs_palloc_req *newreq)
+{
+ nilfs_dat_abort_end(dat, oldreq);
+ nilfs_dat_abort_alloc(dat, newreq);
+}
+
/**
* nilfs_dat_mark_dirty -
* @dat: DAT file inode
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..406070d3ff49 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
#include <linux/buffer_head.h>
#include <linux/fs.h>
-#define NILFS_DAT_GFP NILFS_MDT_GFP
struct nilfs_palloc_req;
@@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
sector_t);
-void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
+ struct nilfs_palloc_req *);
+void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
+ struct nilfs_palloc_req *, int);
+void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
+ struct nilfs_palloc_req *);
int nilfs_dat_mark_dirty(struct inode *, __u64);
int nilfs_dat_freev(struct inode *, __u64 *, size_t);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
direct->d_bmap.b_last_allocated_ptr = ptr;
}
-static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
- __u64 key,
- union nilfs_bmap_ptr_req *req,
- struct nilfs_bmap_stats *stats)
-{
- int ret;
-
- if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
- req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
- ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
- if (ret < 0)
- return ret;
-
- stats->bs_nblocks = 1;
- return 0;
-}
-
-static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
- union nilfs_bmap_ptr_req *req,
- __u64 key, __u64 ptr)
-{
- struct buffer_head *bh;
-
- /* ptr must be a pointer to a buffer head. */
- bh = (struct buffer_head *)((unsigned long)ptr);
- set_buffer_nilfs_volatile(bh);
-
- nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
- nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
-
- if (!nilfs_bmap_dirty(&direct->d_bmap))
- nilfs_bmap_set_dirty(&direct->d_bmap);
-
- if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
- nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
-}
-
static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
{
- struct nilfs_direct *direct;
+ struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
union nilfs_bmap_ptr_req req;
- struct nilfs_bmap_stats stats;
+ struct inode *dat = NULL;
+ struct buffer_head *bh;
int ret;
- direct = (struct nilfs_direct *)bmap;
if (key > NILFS_DIRECT_KEY_MAX)
return -ENOENT;
if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
return -EEXIST;
- ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
- if (ret < 0)
- return ret;
- nilfs_direct_commit_insert(direct, &req, key, ptr);
- nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+ if (NILFS_BMAP_USE_VBN(bmap)) {
+ req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+ dat = nilfs_bmap_get_dat(bmap);
+ }
+ ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
+ if (!ret) {
+ /* ptr must be a pointer to a buffer head. */
+ bh = (struct buffer_head *)((unsigned long)ptr);
+ set_buffer_nilfs_volatile(bh);
- return 0;
-}
+ nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
+ nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
-static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
- union nilfs_bmap_ptr_req *req,
- __u64 key,
- struct nilfs_bmap_stats *stats)
-{
- int ret;
+ if (!nilfs_bmap_dirty(bmap))
+ nilfs_bmap_set_dirty(bmap);
- req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
- ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
- if (!ret)
- stats->bs_nblocks = 1;
- return ret;
-}
+ if (NILFS_BMAP_USE_VBN(bmap))
+ nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
-static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
- union nilfs_bmap_ptr_req *req,
- __u64 key)
-{
- nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
- nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+ nilfs_bmap_add_blocks(bmap, 1);
+ }
+ return ret;
}
static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
{
- struct nilfs_direct *direct;
+ struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
union nilfs_bmap_ptr_req req;
- struct nilfs_bmap_stats stats;
+ struct inode *dat;
int ret;
- direct = (struct nilfs_direct *)bmap;
- if ((key > NILFS_DIRECT_KEY_MAX) ||
+ if (key > NILFS_DIRECT_KEY_MAX ||
nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
return -ENOENT;
- ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
- if (ret < 0)
- return ret;
- nilfs_direct_commit_delete(direct, &req, key);
- nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+ dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
+ req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
- return 0;
+ ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
+ if (!ret) {
+ nilfs_bmap_commit_end_ptr(bmap, &req, dat);
+ nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+ nilfs_bmap_sub_blocks(bmap, 1);
+ }
+ return ret;
}
static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
return 0;
}
-static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
- struct buffer_head *bh)
+static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+ struct buffer_head *bh)
{
- union nilfs_bmap_ptr_req oldreq, newreq;
+ struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+ struct nilfs_palloc_req oldreq, newreq;
+ struct inode *dat;
__u64 key;
__u64 ptr;
int ret;
- key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
+ if (!NILFS_BMAP_USE_VBN(bmap))
+ return 0;
+
+ dat = nilfs_bmap_get_dat(bmap);
+ key = nilfs_bmap_data_get_key(bmap, bh);
ptr = nilfs_direct_get_ptr(direct, key);
if (!buffer_nilfs_volatile(bh)) {
- oldreq.bpr_ptr = ptr;
- newreq.bpr_ptr = ptr;
- ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
- &newreq);
+ oldreq.pr_entry_nr = ptr;
+ newreq.pr_entry_nr = ptr;
+ ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
if (ret < 0)
return ret;
- nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
+ nilfs_dat_commit_update(dat, &oldreq, &newreq,
+ bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
set_buffer_nilfs_volatile(bh);
- nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
+ nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
} else
- ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
+ ret = nilfs_dat_mark_dirty(dat, ptr);
return ret;
}
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
- struct buffer_head *bh)
-{
- struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
-
- return NILFS_BMAP_USE_VBN(bmap) ?
- nilfs_direct_propagate_v(direct, bh) : 0;
-}
-
static int nilfs_direct_assign_v(struct nilfs_direct *direct,
__u64 key, __u64 ptr,
struct buffer_head **bh,
sector_t blocknr,
union nilfs_binfo *binfo)
{
+ struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
union nilfs_bmap_ptr_req req;
int ret;
req.bpr_ptr = ptr;
- ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
- if (unlikely(ret < 0))
- return ret;
-
- binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
- binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
-
- return 0;
+ ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
+ if (!ret) {
+ nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
+ binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+ binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+ }
+ return ret;
}
static int nilfs_direct_assign_p(struct nilfs_direct *direct,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 6bd84a0d8238..fc8278c77cdd 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = {
.splice_read = generic_file_splice_read,
};
-struct inode_operations nilfs_file_inode_operations = {
+const struct inode_operations nilfs_file_inode_operations = {
.truncate = nilfs_truncate,
.setattr = nilfs_setattr,
.permission = nilfs_permission,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1b3c2bb20da9..e6de0a27ab5d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,7 +52,7 @@
#include "dat.h"
#include "ifile.h"
-static struct address_space_operations def_gcinode_aops = {
+static const struct address_space_operations def_gcinode_aops = {
.sync_page = block_sync_page,
};
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..ecc3ba76db47 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
#include "mdt.h"
#include "alloc.h"
-#define NILFS_IFILE_GFP NILFS_MDT_GFP
static inline struct nilfs_inode *
nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..2d2c501deb54 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
return size;
}
-struct address_space_operations nilfs_aops = {
+const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
.readpage = nilfs_readpage,
.sync_page = block_sync_page,
@@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
- if (nilfs_read_inode_common(inode, raw_inode))
+ err = nilfs_read_inode_common(inode, raw_inode);
+ if (err)
goto failed_unmap;
if (S_ISREG(inode->i_mode)) {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..6572ea4bc4df 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
const char *msg;
int ret;
- ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
- if (ret < 0) {
- msg = "cannot read source blocks";
- goto failed;
- }
-
ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
if (ret < 0) {
/*
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
}
}
- ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+ /*
+ * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
+ * which will operates an inode list without blocking.
+ * To protect the list from concurrent operations,
+ * nilfs_ioctl_move_blocks should be atomic operation.
+ */
+ if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
+ ret = -EBUSY;
+ goto out_free;
+ }
+
+ ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
+ if (ret < 0)
+ printk(KERN_ERR "NILFS: GC failed during preparation: "
+ "cannot read source blocks: err=%d\n", ret);
+ else
+ ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+
+ clear_nilfs_gc_running(nilfs);
out_free:
while (--n >= 0)
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2dfd47714ae5..b18c4998f8d0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
goto failed_unlock;
err = -EEXIST;
- if (buffer_uptodate(bh) || buffer_mapped(bh))
+ if (buffer_uptodate(bh))
goto failed_bh;
-#if 0
- /* The uptodate flag is not protected by the page lock, but
- the mapped flag is. Thus, we don't have to wait the buffer. */
+
wait_on_buffer(bh);
if (buffer_uptodate(bh))
goto failed_bh;
-#endif
bh->b_bdev = nilfs->ns_bdev;
err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
int mode, struct buffer_head **out_bh)
{
struct buffer_head *bh;
- unsigned long blknum = 0;
+ __u64 blknum = 0;
int ret = -ENOMEM;
bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
unlock_buffer(bh);
goto out;
}
- if (!buffer_mapped(bh)) { /* unused buffer */
- ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
- &blknum);
- if (unlikely(ret)) {
- unlock_buffer(bh);
- goto failed_bh;
- }
- bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
- bh->b_blocknr = blknum;
- set_buffer_mapped(bh);
+
+ ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
+ if (unlikely(ret)) {
+ unlock_buffer(bh);
+ goto failed_bh;
}
+ bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
+ bh->b_blocknr = (sector_t)blknum;
+ set_buffer_mapped(bh);
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
@@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
struct inode *inode = container_of(page->mapping,
struct inode, i_data);
struct super_block *sb = inode->i_sb;
+ struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
struct nilfs_sb_info *writer = NULL;
int err = 0;
@@ -411,9 +407,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
if (page->mapping->assoc_mapping)
return 0; /* Do not request flush for shadow page cache */
if (!sb) {
- writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
+ down_read(&nilfs->ns_writer_sem);
+ writer = nilfs->ns_writer;
if (!writer) {
- nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+ up_read(&nilfs->ns_writer_sem);
return -EROFS;
}
sb = writer->s_super;
@@ -425,17 +422,17 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
nilfs_flush_segment(sb, inode->i_ino);
if (writer)
- nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+ up_read(&nilfs->ns_writer_sem);
return err;
}
-static struct address_space_operations def_mdt_aops = {
+static const struct address_space_operations def_mdt_aops = {
.writepage = nilfs_mdt_write_page,
.sync_page = block_sync_page,
};
-static struct inode_operations def_mdt_iops;
+static const struct inode_operations def_mdt_iops;
static struct file_operations def_mdt_fops;
/*
@@ -516,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
}
struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
- ino_t ino, gfp_t gfp_mask)
+ ino_t ino)
{
- struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
+ struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
+ NILFS_MDT_GFP);
if (!inode)
return NULL;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..431599733c9b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
int nilfs_mdt_fetch_dirty(struct inode *);
-struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
- gfp_t);
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
ino_t, gfp_t);
void nilfs_mdt_destroy(struct inode *);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index df70dadb336f..ed02e886fa79 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -448,7 +448,7 @@ out:
return err;
}
-struct inode_operations nilfs_dir_inode_operations = {
+const struct inode_operations nilfs_dir_inode_operations = {
.create = nilfs_create,
.lookup = nilfs_lookup,
.link = nilfs_link,
@@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = {
.permission = nilfs_permission,
};
-struct inode_operations nilfs_special_inode_operations = {
+const struct inode_operations nilfs_special_inode_operations = {
.setattr = nilfs_setattr,
.permission = nilfs_permission,
};
-struct inode_operations nilfs_symlink_inode_operations = {
+const struct inode_operations nilfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 724c63766e82..bad7368782d0 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -295,12 +295,12 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
* Inodes and files operations
*/
extern struct file_operations nilfs_dir_operations;
-extern struct inode_operations nilfs_file_inode_operations;
+extern const struct inode_operations nilfs_file_inode_operations;
extern struct file_operations nilfs_file_operations;
-extern struct address_space_operations nilfs_aops;
-extern struct inode_operations nilfs_dir_inode_operations;
-extern struct inode_operations nilfs_special_inode_operations;
-extern struct inode_operations nilfs_symlink_inode_operations;
+extern const struct address_space_operations nilfs_aops;
+extern const struct inode_operations nilfs_dir_inode_operations;
+extern const struct inode_operations nilfs_special_inode_operations;
+extern const struct inode_operations nilfs_symlink_inode_operations;
/*
* filesystem type
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6dc83591d118 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
printk(KERN_WARNING
"NILFS warning: error recovering data block "
"(err=%d, ino=%lu, block-offset=%llu)\n",
- err, rb->ino, (unsigned long long)rb->blkoff);
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
{
struct bio *bio;
- bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+ bio = bio_alloc(GFP_NOIO, nr_vecs);
if (bio == NULL) {
while (!bio && (nr_vecs >>= 1))
- bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+ bio = bio_alloc(GFP_NOIO, nr_vecs);
}
if (likely(bio)) {
bio->bi_bdev = sb->s_bdev;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 51ff3d0a4ee2..683df89dbae5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
nilfs_discontinued(nilfs)) {
down_write(&nilfs->ns_sem);
- req->sb_err = nilfs_commit_super(sbi, 0);
+ req->sb_err = nilfs_commit_super(sbi,
+ nilfs_altsb_need_update(nilfs));
up_write(&nilfs->ns_sem);
}
}
@@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg)
} else {
DEFINE_WAIT(wait);
int should_sleep = 1;
+ struct the_nilfs *nilfs;
prepare_to_wait(&sci->sc_wait_daemon, &wait,
TASK_INTERRUPTIBLE);
@@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg)
finish_wait(&sci->sc_wait_daemon, &wait);
timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
time_after_eq(jiffies, sci->sc_timer->expires));
+ nilfs = sci->sc_sbi->s_nilfs;
+ if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
+ set_nilfs_discontinued(nilfs);
}
goto loop;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..0e99e5c0bd0f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,7 +28,6 @@
#include <linux/nilfs2_fs.h>
#include "mdt.h"
-#define NILFS_SUFILE_GFP NILFS_MDT_GFP
static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
{
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8e2ec43b18f4..644e66727dd0 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
#include <linux/writeback.h>
#include <linux/kobject.h>
#include <linux/exportfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
#include "nilfs.h"
#include "mdt.h"
#include "alloc.h"
@@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
"(NILFS)");
MODULE_LICENSE("GPL");
-static void nilfs_write_super(struct super_block *sb);
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
/**
@@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb)
lock_kernel();
- if (sb->s_dirt)
- nilfs_write_super(sb);
-
nilfs_detach_segment_constructor(sbi);
if (!(sb->s_flags & MS_RDONLY)) {
@@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb)
unlock_kernel();
}
-/**
- * nilfs_write_super - write super block(s) of NILFS
- * @sb: super_block
- *
- * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
- * clears s_dirt. This function is called in the section protected by
- * lock_super().
- *
- * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
- * of the struct the_nilfs. Lock order must be as follows:
- *
- * 1. lock_super()
- * 2. down_write(&nilfs->ns_sem)
- *
- * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
- * of the super block (nilfs->ns_sbp[]).
- *
- * In most cases, VFS functions call lock_super() before calling these
- * methods. So we must be careful not to bring on deadlocks when using
- * lock_super(); see generic_shutdown_super(), write_super(), and so on.
- *
- * Note that order of lock_kernel() and lock_super() depends on contexts
- * of VFS. We should also note that lock_kernel() can be used in its
- * protective section and only the outermost one has an effect.
- */
-static void nilfs_write_super(struct super_block *sb)
+static int nilfs_sync_fs(struct super_block *sb, int wait)
{
struct nilfs_sb_info *sbi = NILFS_SB(sb);
struct the_nilfs *nilfs = sbi->s_nilfs;
-
- down_write(&nilfs->ns_sem);
- if (!(sb->s_flags & MS_RDONLY)) {
- struct nilfs_super_block **sbp = nilfs->ns_sbp;
- u64 t = get_seconds();
- int dupsb;
-
- if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
- t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
- up_write(&nilfs->ns_sem);
- return;
- }
- dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
- nilfs_commit_super(sbi, dupsb);
- }
- sb->s_dirt = 0;
- up_write(&nilfs->ns_sem);
-}
-
-static int nilfs_sync_fs(struct super_block *sb, int wait)
-{
int err = 0;
- nilfs_write_super(sb);
-
/* This function is called when super block should be written back */
if (wait)
err = nilfs_construct_segment(sb);
+
+ down_write(&nilfs->ns_sem);
+ if (sb->s_dirt)
+ nilfs_commit_super(sbi, 1);
+ up_write(&nilfs->ns_sem);
+
return err;
}
@@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
list_add(&sbi->s_list, &nilfs->ns_supers);
up_write(&nilfs->ns_super_sem);
- sbi->s_ifile = nilfs_mdt_new(
- nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
+ sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
if (!sbi->s_ifile)
return -ENOMEM;
@@ -416,8 +371,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
if (unlikely(err))
goto failed;
+ down_read(&nilfs->ns_segctor_sem);
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
&bh_cp);
+ up_read(&nilfs->ns_segctor_sem);
if (unlikely(err)) {
if (err == -ENOENT || err == -EINVAL) {
printk(KERN_ERR
@@ -527,7 +484,27 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct super_operations nilfs_sops = {
+static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct super_block *sb = vfs->mnt_sb;
+ struct nilfs_sb_info *sbi = NILFS_SB(sb);
+
+ if (!nilfs_test_opt(sbi, BARRIER))
+ seq_printf(seq, ",barrier=off");
+ if (nilfs_test_opt(sbi, SNAPSHOT))
+ seq_printf(seq, ",cp=%llu",
+ (unsigned long long int)sbi->s_snapshot_cno);
+ if (nilfs_test_opt(sbi, ERRORS_RO))
+ seq_printf(seq, ",errors=remount-ro");
+ if (nilfs_test_opt(sbi, ERRORS_PANIC))
+ seq_printf(seq, ",errors=panic");
+ if (nilfs_test_opt(sbi, STRICT_ORDER))
+ seq_printf(seq, ",order=strict");
+
+ return 0;
+}
+
+static const struct super_operations nilfs_sops = {
.alloc_inode = nilfs_alloc_inode,
.destroy_inode = nilfs_destroy_inode,
.dirty_inode = nilfs_dirty_inode,
@@ -536,7 +513,7 @@ static struct super_operations nilfs_sops = {
/* .drop_inode = nilfs_drop_inode, */
.delete_inode = nilfs_delete_inode,
.put_super = nilfs_put_super,
- .write_super = nilfs_write_super,
+ /* .write_super = nilfs_write_super, */
.sync_fs = nilfs_sync_fs,
/* .write_super_lockfs */
/* .unlockfs */
@@ -544,7 +521,7 @@ static struct super_operations nilfs_sops = {
.remount_fs = nilfs_remount,
.clear_inode = nilfs_clear_inode,
/* .umount_begin */
- /* .show_options */
+ .show_options = nilfs_show_options
};
static struct inode *
@@ -583,7 +560,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
nilfs_nfs_get_inode);
}
-static struct export_operations nilfs_export_ops = {
+static const struct export_operations nilfs_export_ops = {
.fh_to_dentry = nilfs_fh_to_dentry,
.fh_to_parent = nilfs_fh_to_parent,
.get_parent = nilfs_get_parent,
@@ -814,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
if (sb->s_flags & MS_RDONLY) {
if (nilfs_test_opt(sbi, SNAPSHOT)) {
+ down_read(&nilfs->ns_segctor_sem);
err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
sbi->s_snapshot_cno);
- if (err < 0)
+ up_read(&nilfs->ns_segctor_sem);
+ if (err < 0) {
+ if (err == -ENOENT)
+ err = -EINVAL;
goto failed_sbi;
+ }
if (!err) {
printk(KERN_ERR
"NILFS: The specified checkpoint is "
@@ -1125,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
*/
sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
- if (!sd.cno)
- /* trying to get the latest checkpoint. */
- sd.cno = nilfs_last_cno(nilfs);
-
/*
* Get super block instance holding the nilfs_sb_info struct.
* A new instance is allocated if no existing mount is present or
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
nilfs->ns_bdev = bdev;
atomic_set(&nilfs->ns_count, 1);
- atomic_set(&nilfs->ns_writer_refcount, -1);
atomic_set(&nilfs->ns_ndirtyblks, 0);
init_rwsem(&nilfs->ns_sem);
init_rwsem(&nilfs->ns_super_sem);
mutex_init(&nilfs->ns_mount_mutex);
- mutex_init(&nilfs->ns_writer_mutex);
+ init_rwsem(&nilfs->ns_writer_sem);
INIT_LIST_HEAD(&nilfs->ns_list);
INIT_LIST_HEAD(&nilfs->ns_supers);
spin_lock_init(&nilfs->ns_last_segment_lock);
@@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
inode_size = nilfs->ns_inode_size;
err = -ENOMEM;
- nilfs->ns_dat = nilfs_mdt_new(
- nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+ nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
if (unlikely(!nilfs->ns_dat))
goto failed;
- nilfs->ns_gc_dat = nilfs_mdt_new(
- nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+ nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
if (unlikely(!nilfs->ns_gc_dat))
goto failed_dat;
- nilfs->ns_cpfile = nilfs_mdt_new(
- nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
+ nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
if (unlikely(!nilfs->ns_cpfile))
goto failed_gc_dat;
- nilfs->ns_sufile = nilfs_mdt_new(
- nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
+ nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
if (unlikely(!nilfs->ns_sufile))
goto failed_cpfile;
@@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
- bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
+ bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
/* Finding last segment */
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e8adbffc626f..20abd55881e0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
the latest checkpoint was loaded */
THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
+ THE_NILFS_GC_RUNNING, /* gc process is running */
};
/**
@@ -50,8 +51,7 @@ enum {
* @ns_sem: semaphore for shared states
* @ns_super_sem: semaphore for global operations across super block instances
* @ns_mount_mutex: mutex protecting mount process of nilfs
- * @ns_writer_mutex: mutex protecting ns_writer attach/detach
- * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_writer_sem: semaphore protecting ns_writer attach/detach
* @ns_current: back pointer to current mount
* @ns_sbh: buffer heads of on-disk super blocks
* @ns_sbp: pointers to super block data
@@ -100,8 +100,7 @@ struct the_nilfs {
struct rw_semaphore ns_sem;
struct rw_semaphore ns_super_sem;
struct mutex ns_mount_mutex;
- struct mutex ns_writer_mutex;
- atomic_t ns_writer_refcount;
+ struct rw_semaphore ns_writer_sem;
/*
* components protected by ns_super_sem
@@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
THE_NILFS_FNS(INIT, init)
THE_NILFS_FNS(LOADED, loaded)
THE_NILFS_FNS(DISCONTINUED, discontinued)
+THE_NILFS_FNS(GC_RUNNING, gc_running)
/* Minimum interval of periodical update of superblocks (in seconds) */
#define NILFS_SB_FREQ 10
#define NILFS_ALTSB_FREQ 60 /* spare superblock */
+static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
+{
+ u64 t = get_seconds();
+ return t < nilfs->ns_sbwtime[0] ||
+ t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
+}
+
+static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
+{
+ u64 t = get_seconds();
+ struct nilfs_super_block **sbp = nilfs->ns_sbp;
+ return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
+}
+
void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
struct the_nilfs *find_or_create_nilfs(struct block_device *);
void put_nilfs(struct the_nilfs *);
@@ -221,39 +235,26 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
atomic_inc(&nilfs->ns_count);
}
-static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
-{
- if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
- mutex_lock(&nilfs->ns_writer_mutex);
- return nilfs->ns_writer;
-}
-
-static inline void nilfs_put_writer(struct the_nilfs *nilfs)
-{
- if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
- mutex_unlock(&nilfs->ns_writer_mutex);
-}
-
static inline void
nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
{
- mutex_lock(&nilfs->ns_writer_mutex);
+ down_write(&nilfs->ns_writer_sem);
nilfs->ns_writer = sbi;
- mutex_unlock(&nilfs->ns_writer_mutex);
+ up_write(&nilfs->ns_writer_sem);
}
static inline void
nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
{
- mutex_lock(&nilfs->ns_writer_mutex);
+ down_write(&nilfs->ns_writer_sem);
if (sbi == nilfs->ns_writer)
nilfs->ns_writer = NULL;
- mutex_unlock(&nilfs->ns_writer_mutex);
+ up_write(&nilfs->ns_writer_sem);
}
static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
{
- if (!atomic_dec_and_test(&sbi->s_count))
+ if (atomic_dec_and_test(&sbi->s_count))
kfree(sbi);
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 47cd258fd24d..c9ee67b442e1 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
event_priv->wd = wd;
ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
- /* EEXIST is not an error */
- if (ret == -EEXIST)
- ret = 0;
-
- /* did event_priv get attached? */
- if (list_empty(&fsn_event_priv->event_list))
+ if (ret) {
inotify_free_event_priv(fsn_event_priv);
+ /* EEXIST says we tail matched, EOVERFLOW isn't something
+ * to report up the stack. */
+ if ((ret == -EEXIST) ||
+ (ret == -EOVERFLOW))
+ ret = 0;
+ }
/*
* If we hold the entry until after the event is on the queue
@@ -104,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
return send;
}
+/*
+ * This is NEVER supposed to be called. Inotify marks should either have been
+ * removed from the idr when the watch was removed or in the
+ * fsnotify_destroy_mark_by_group() call when the inotify instance was being
+ * torn down. This is only called if the idr is about to be freed but there
+ * are still marks in it.
+ */
static int idr_callback(int id, void *p, void *data)
{
- BUG();
+ struct fsnotify_mark_entry *entry;
+ struct inotify_inode_mark_entry *ientry;
+ static bool warned = false;
+
+ if (warned)
+ return 0;
+
+ warned = false;
+ entry = p;
+ ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+
+ WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
+ "idr. Probably leaking memory\n", id, p, data);
+
+ /*
+ * I'm taking the liberty of assuming that the mark in question is a
+ * valid address and I'm dereferencing it. This might help to figure
+ * out why we got here and the panic is no worse than the original
+ * BUG() that was here.
+ */
+ if (entry)
+ printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
+ entry->group, entry->inode, ientry->wd);
return 0;
}
static void inotify_free_group_priv(struct fsnotify_group *group)
{
/* ideally the idr is empty and we won't hit the BUG in teh callback */
- idr_for_each(&group->inotify_data.idr, idr_callback, NULL);
+ idr_for_each(&group->inotify_data.idr, idr_callback, group);
idr_remove_all(&group->inotify_data.idr);
idr_destroy(&group->inotify_data.idr);
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f30d9bbc2e1b..dcd2040d330c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,9 +47,6 @@
static struct vfsmount *inotify_mnt __read_mostly;
-/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */
-static struct inotify_event nul_inotify_event;
-
/* these are configurable via /proc/sys/fs/inotify/ */
static int inotify_max_user_instances __read_mostly;
static int inotify_max_queued_events __read_mostly;
@@ -157,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
event = fsnotify_peek_notify_event(group);
- event_size += roundup(event->name_len, event_size);
+ if (event->name_len)
+ event_size += roundup(event->name_len + 1, event_size);
if (event_size > count)
return ERR_PTR(-EINVAL);
@@ -183,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
struct fsnotify_event_private_data *fsn_priv;
struct inotify_event_private_data *priv;
size_t event_size = sizeof(struct inotify_event);
- size_t name_len;
+ size_t name_len = 0;
/* we get the inotify watch descriptor from the event private data */
spin_lock(&event->lock);
@@ -199,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
inotify_free_event_priv(fsn_priv);
}
- /* round up event->name_len so it is a multiple of event_size */
- name_len = roundup(event->name_len, event_size);
+ /*
+ * round up event->name_len so it is a multiple of event_size
+ * plus an extra byte for the terminating '\0'.
+ */
+ if (event->name_len)
+ name_len = roundup(event->name_len + 1, event_size);
inotify_event.len = name_len;
inotify_event.mask = inotify_mask_to_arg(event->mask);
@@ -224,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
return -EFAULT;
buf += event->name_len;
- /* fill userspace with 0's from nul_inotify_event */
- if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
+ /* fill userspace with 0's */
+ if (clear_user(buf, len_to_zero))
return -EFAULT;
buf += len_to_zero;
event_size += name_len;
@@ -326,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
list_for_each_entry(holder, &group->notification_list, event_list) {
event = holder->event;
send_len += sizeof(struct inotify_event);
- send_len += roundup(event->name_len,
- sizeof(struct inotify_event));
+ if (event->name_len)
+ send_len += roundup(event->name_len + 1,
+ sizeof(struct inotify_event));
}
mutex_unlock(&group->notification_mutex);
ret = put_user(send_len, (int __user *) p);
@@ -364,20 +367,53 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
return error;
}
+/*
+ * Remove the mark from the idr (if present) and drop the reference
+ * on the mark because it was in the idr.
+ */
static void inotify_remove_from_idr(struct fsnotify_group *group,
struct inotify_inode_mark_entry *ientry)
{
struct idr *idr;
+ struct fsnotify_mark_entry *entry;
+ struct inotify_inode_mark_entry *found_ientry;
+ int wd;
spin_lock(&group->inotify_data.idr_lock);
idr = &group->inotify_data.idr;
- idr_remove(idr, ientry->wd);
- spin_unlock(&group->inotify_data.idr_lock);
+ wd = ientry->wd;
+
+ if (wd == -1)
+ goto out;
+
+ entry = idr_find(&group->inotify_data.idr, wd);
+ if (unlikely(!entry))
+ goto out;
+
+ found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+ if (unlikely(found_ientry != ientry)) {
+ /* We found an entry in the idr with the right wd, but it's
+ * not the entry we were told to remove. eparis seriously
+ * fucked up somewhere. */
+ WARN_ON(1);
+ ientry->wd = -1;
+ goto out;
+ }
+
+ /* One ref for being in the idr, one ref held by the caller */
+ BUG_ON(atomic_read(&entry->refcnt) < 2);
+
+ idr_remove(idr, wd);
ientry->wd = -1;
+
+ /* removed from the idr, drop that ref */
+ fsnotify_put_mark(entry);
+out:
+ spin_unlock(&group->inotify_data.idr_lock);
}
+
/*
- * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
- * internal reference help on the mark because it is in the idr.
+ * Send IN_IGNORED for this wd, remove this wd from the idr.
*/
void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
struct fsnotify_group *group)
@@ -386,6 +422,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
struct fsnotify_event *ignored_event;
struct inotify_event_private_data *event_priv;
struct fsnotify_event_private_data *fsn_event_priv;
+ int ret;
ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
FSNOTIFY_EVENT_NONE, NULL, 0,
@@ -404,10 +441,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
fsn_event_priv->group = group;
event_priv->wd = ientry->wd;
- fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
-
- /* did the private data get added? */
- if (list_empty(&fsn_event_priv->event_list))
+ ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
+ if (ret)
inotify_free_event_priv(fsn_event_priv);
skip_send_ignore:
@@ -418,9 +453,6 @@ skip_send_ignore:
/* remove this entry from the idr */
inotify_remove_from_idr(group, ientry);
- /* removed from idr, drop that reference */
- fsnotify_put_mark(entry);
-
atomic_dec(&group->inotify_data.user->inotify_watches);
}
@@ -432,80 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry)
kmem_cache_free(inotify_inode_mark_cachep, ientry);
}
-static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+static int inotify_update_existing_watch(struct fsnotify_group *group,
+ struct inode *inode,
+ u32 arg)
{
- struct fsnotify_mark_entry *entry = NULL;
+ struct fsnotify_mark_entry *entry;
struct inotify_inode_mark_entry *ientry;
- struct inotify_inode_mark_entry *tmp_ientry;
- int ret = 0;
- int add = (arg & IN_MASK_ADD);
- __u32 mask;
__u32 old_mask, new_mask;
+ __u32 mask;
+ int add = (arg & IN_MASK_ADD);
+ int ret;
/* don't allow invalid bits: we don't want flags set */
mask = inotify_arg_to_mask(arg);
if (unlikely(!mask))
return -EINVAL;
- tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
- if (unlikely(!tmp_ientry))
- return -ENOMEM;
- /* we set the mask at the end after attaching it */
- fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
- tmp_ientry->wd = -1;
-
-find_entry:
spin_lock(&inode->i_lock);
entry = fsnotify_find_mark_entry(group, inode);
spin_unlock(&inode->i_lock);
- if (entry) {
- ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
- } else {
- ret = -ENOSPC;
- if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
- goto out_err;
-retry:
- ret = -ENOMEM;
- if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
- goto out_err;
-
- spin_lock(&group->inotify_data.idr_lock);
- ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
- group->inotify_data.last_wd,
- &tmp_ientry->wd);
- spin_unlock(&group->inotify_data.idr_lock);
- if (ret) {
- if (ret == -EAGAIN)
- goto retry;
- goto out_err;
- }
+ if (!entry)
+ return -ENOENT;
- ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
- if (ret) {
- inotify_remove_from_idr(group, tmp_ientry);
- if (ret == -EEXIST)
- goto find_entry;
- goto out_err;
- }
-
- /* tmp_ientry has been added to the inode, so we are all set up.
- * now we just need to make sure tmp_ientry doesn't get freed and
- * we need to set up entry and ientry so the generic code can
- * do its thing. */
- ientry = tmp_ientry;
- entry = &ientry->fsn_entry;
- tmp_ientry = NULL;
-
- atomic_inc(&group->inotify_data.user->inotify_watches);
-
- /* update the idr hint */
- group->inotify_data.last_wd = ientry->wd;
-
- /* we put the mark on the idr, take a reference */
- fsnotify_get_mark(entry);
- }
-
- ret = ientry->wd;
+ ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
spin_lock(&entry->lock);
@@ -537,18 +518,107 @@ retry:
fsnotify_recalc_group_mask(group);
}
- /* this either matches fsnotify_find_mark_entry, or init_mark_entry
- * depending on which path we took... */
+ /* return the wd */
+ ret = ientry->wd;
+
+ /* match the get from fsnotify_find_mark_entry() */
fsnotify_put_mark(entry);
+ return ret;
+}
+
+static int inotify_new_watch(struct fsnotify_group *group,
+ struct inode *inode,
+ u32 arg)
+{
+ struct inotify_inode_mark_entry *tmp_ientry;
+ __u32 mask;
+ int ret;
+
+ /* don't allow invalid bits: we don't want flags set */
+ mask = inotify_arg_to_mask(arg);
+ if (unlikely(!mask))
+ return -EINVAL;
+
+ tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+ if (unlikely(!tmp_ientry))
+ return -ENOMEM;
+
+ fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
+ tmp_ientry->fsn_entry.mask = mask;
+ tmp_ientry->wd = -1;
+
+ ret = -ENOSPC;
+ if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
+ goto out_err;
+retry:
+ ret = -ENOMEM;
+ if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
+ goto out_err;
+
+ spin_lock(&group->inotify_data.idr_lock);
+ ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
+ group->inotify_data.last_wd,
+ &tmp_ientry->wd);
+ spin_unlock(&group->inotify_data.idr_lock);
+ if (ret) {
+ /* idr was out of memory allocate and try again */
+ if (ret == -EAGAIN)
+ goto retry;
+ goto out_err;
+ }
+
+ /* we put the mark on the idr, take a reference */
+ fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
+ /* we are on the idr, now get on the inode */
+ ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+ if (ret) {
+ /* we failed to get on the inode, get off the idr */
+ inotify_remove_from_idr(group, tmp_ientry);
+ goto out_err;
+ }
+
+ /* update the idr hint, who cares about races, it's just a hint */
+ group->inotify_data.last_wd = tmp_ientry->wd;
+
+ /* increment the number of watches the user has */
+ atomic_inc(&group->inotify_data.user->inotify_watches);
+
+ /* return the watch descriptor for this new entry */
+ ret = tmp_ientry->wd;
+
+ /* match the ref from fsnotify_init_markentry() */
+ fsnotify_put_mark(&tmp_ientry->fsn_entry);
+
+ /* if this mark added a new event update the group mask */
+ if (mask & ~group->mask)
+ fsnotify_recalc_group_mask(group);
+
out_err:
- /* could be an error, could be that we found an existing mark */
- if (tmp_ientry) {
- /* on the idr but didn't make it on the inode */
- if (tmp_ientry->wd != -1)
- inotify_remove_from_idr(group, tmp_ientry);
+ if (ret < 0)
kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
- }
+
+ return ret;
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+ int ret = 0;
+
+retry:
+ /* try to update and existing watch with the new arg */
+ ret = inotify_update_existing_watch(group, inode, arg);
+ /* no mark present, try to add a new one */
+ if (ret == -ENOENT)
+ ret = inotify_new_watch(group, inode, arg);
+ /*
+ * inotify_new_watch could race with another thread which did an
+ * inotify_new_watch between the update_existing and the add watch
+ * here, go back and try to update an existing mark again.
+ */
+ if (ret == -EEXIST)
+ goto retry;
return ret;
}
@@ -568,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
spin_lock_init(&group->inotify_data.idr_lock);
idr_init(&group->inotify_data.idr);
- group->inotify_data.last_wd = 0;
+ group->inotify_data.last_wd = 1;
group->inotify_data.user = user;
group->inotify_data.fa = NULL;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 521368574e97..3816d5750dd5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -153,6 +153,10 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
return true;
break;
case (FSNOTIFY_EVENT_NONE):
+ if (old->mask & FS_Q_OVERFLOW)
+ return true;
+ else if (old->mask & FS_IN_IGNORED)
+ return false;
return false;
};
}
@@ -171,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
struct list_head *list = &group->notification_list;
struct fsnotify_event_holder *last_holder;
struct fsnotify_event *last_event;
-
- /* easy to tell if priv was attached to the event */
- INIT_LIST_HEAD(&priv->event_list);
+ int ret = 0;
/*
* There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -194,6 +196,7 @@ alloc_holder:
if (group->q_len >= group->max_events) {
event = &q_overflow_event;
+ ret = -EOVERFLOW;
/* sorry, no private data on the overflow event */
priv = NULL;
}
@@ -235,7 +238,7 @@ alloc_holder:
mutex_unlock(&group->notification_mutex);
wake_up(&group->notification_waitq);
- return 0;
+ return ret;
}
/*
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..4350d4993b18 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
*ppos = pos;
if (cached_page)
page_cache_release(cached_page);
- /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
- if (likely(!status)) {
- if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
- if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
- status = generic_osync_inode(vi, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- }
- }
pagevec_lru_add_file(&lru_pvec);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
mutex_lock(&inode->i_mutex);
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err = sync_page_range(inode, mapping, pos, ret);
+ if (ret > 0) {
+ int err = generic_write_sync(file, pos, ret);
if (err < 0)
ret = err;
}
@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
mutex_unlock(&inode->i_mutex);
- if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err = sync_page_range(inode, mapping, *ppos - ret, ret);
+ if (ret > 0) {
+ int err = generic_write_sync(file, *ppos - ret, ret);
if (err < 0)
ret = err;
}
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 50931b1ce4b9..8b2549f672bf 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -829,7 +829,7 @@ enum {
/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
- is used to to obtain all flags that are valid for setting. */
+ is used to obtain all flags that are valid for setting. */
/*
* The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
* FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index cd0be3f5c3cd..a44b14cbceeb 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
/* return (void *)__get_free_page(gfp_mask); */
}
- if (likely(size >> PAGE_SHIFT < num_physpages))
+ if (likely((size >> PAGE_SHIFT) < totalram_pages))
return __vmalloc(size, gfp_mask, PAGE_KERNEL);
return NULL;
}
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
* it is dirty in the inode meta data rather than the data page cache of the
* inode, and thus there are no data pages that need writing out. Therefore, a
* full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to
- * ensure ->write_inode is called from generic_osync_inode() and this needs to
- * happen or the file data would not necessarily hit the device synchronously,
- * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC
- * simply "feels" better than just I_DIRTY_SYNC, since the file data has not
- * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own
- * would suggest.
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
*/
void __mark_mft_record_dirty(ntfs_inode *ni)
{
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9edcde4974aa..ab513ddaeff2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
* immediately to their right.
*/
left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
- if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+ if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+ BUG_ON(right_child_el->l_tree_depth);
BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
}
@@ -2476,15 +2477,37 @@ out_ret_path:
return ret;
}
-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
- struct ocfs2_path *path)
+static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+ int subtree_index, struct ocfs2_path *path)
{
- int i, idx;
+ int i, idx, ret;
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_list *el;
struct ocfs2_extent_block *eb;
u32 range;
+ /*
+ * In normal tree rotation process, we will never touch the
+ * tree branch above subtree_index and ocfs2_extend_rotate_transaction
+ * doesn't reserve the credits for them either.
+ *
+ * But we do have a special case here which will update the rightmost
+ * records for all the bh in the path.
+ * So we have to allocate extra credits and access them.
+ */
+ ret = ocfs2_extend_trans(handle,
+ handle->h_buffer_credits + subtree_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(inode, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* Path should always be rightmost. */
eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2505,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
ocfs2_journal_dirty(handle, path->p_node[i].bh);
}
+out:
+ return ret;
}
static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
@@ -2717,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
if (del_right_subtree) {
ocfs2_unlink_subtree(inode, handle, left_path, right_path,
subtree_index, dealloc);
- ocfs2_update_edge_lengths(inode, handle, left_path);
+ ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -3034,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
ocfs2_unlink_subtree(inode, handle, left_path, path,
subtree_index, dealloc);
- ocfs2_update_edge_lengths(inode, handle, left_path);
+ ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+ left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -6816,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
}
status = 0;
bail:
-
+ brelse(last_eb_bh);
mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3a1484..8a1e61545f41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
dump_stack();
+ goto bail;
}
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
@@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc {
*/
unsigned c_new;
unsigned c_unwritten;
+ unsigned c_needs_zero;
};
-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
- return d->c_new || d->c_unwritten;
-}
-
struct ocfs2_write_ctxt {
/* Logical cluster position / len of write */
u32 w_cpos;
u32 w_clen;
+ /* First cluster allocated in a nonsparse extend */
+ u32 w_first_new_cpos;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
return -ENOMEM;
wc->w_cpos = pos >> osb->s_clustersize_bits;
+ wc->w_first_new_cpos = UINT_MAX;
cend = (pos + len - 1) >> osb->s_clustersize_bits;
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
@@ -1217,20 +1218,18 @@ out:
*/
static int ocfs2_write_cluster(struct address_space *mapping,
u32 phys, unsigned int unwritten,
+ unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new, should_zero = 0;
+ int ret, i, new;
u64 v_blkno, p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
new = phys == 0 ? 1 : 0;
- if (new || unwritten)
- should_zero = 1;
-
if (new) {
u32 tmp_pos;
@@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
if (tmpret) {
mlog_errno(tmpret);
if (ret == 0)
- tmpret = ret;
+ ret = tmpret;
}
}
@@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
local_len = osb->s_clustersize - cluster_off;
ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten, data_ac, meta_ac,
+ desc->c_unwritten,
+ desc->c_needs_zero,
+ data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
if (ret) {
mlog_errno(ret);
@@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
* newly allocated cluster.
*/
desc = &wc->w_desc[0];
- if (ocfs2_should_zero_cluster(desc))
+ if (desc->c_needs_zero)
ocfs2_figure_cluster_boundaries(osb,
desc->c_cpos,
&wc->w_target_from,
NULL);
desc = &wc->w_desc[wc->w_clen - 1];
- if (ocfs2_should_zero_cluster(desc))
+ if (desc->c_needs_zero)
ocfs2_figure_cluster_boundaries(osb,
desc->c_cpos,
NULL,
@@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
phys++;
}
+ /*
+ * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+ * file that got extended. w_first_new_cpos tells us
+ * where the newly allocated clusters are so we can
+ * zero them.
+ */
+ if (desc->c_cpos >= wc->w_first_new_cpos) {
+ BUG_ON(phys == 0);
+ desc->c_needs_zero = 1;
+ }
+
desc->c_phys = phys;
if (phys == 0) {
desc->c_new = 1;
+ desc->c_needs_zero = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
- if (ext_flags & OCFS2_EXT_UNWRITTEN)
+
+ if (ext_flags & OCFS2_EXT_UNWRITTEN) {
desc->c_unwritten = 1;
+ desc->c_needs_zero = 1;
+ }
num_clusters--;
}
@@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
if (newsize <= i_size_read(inode))
return 0;
- ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+ ret = ocfs2_extend_no_holes(inode, newsize, pos);
if (ret)
mlog_errno(ret);
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+
return ret;
}
@@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
- int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+ int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
unsigned int clusters_to_alloc, extents_to_split;
struct ocfs2_write_ctxt *wc;
struct inode *inode = mapping->host;
@@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
}
- ocfs2_set_target_boundaries(osb, wc, pos, len,
- clusters_to_alloc + extents_to_split);
+ /*
+ * We have to zero sparse allocated clusters, unwritten extent clusters,
+ * and non-sparse clusters we just extended. For non-sparse writes,
+ * we know zeros will only be needed in the first and/or last cluster.
+ */
+ if (clusters_to_alloc || extents_to_split ||
+ (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ cluster_of_pages = 1;
+ else
+ cluster_of_pages = 0;
+
+ ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
@@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* extent.
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
- clusters_to_alloc + extents_to_split,
- mmap_page);
+ cluster_of_pages, mmap_page);
if (ret) {
mlog_errno(ret);
goto out_quota;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431a031d..b4957c7d9fe2 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
goto bail;
}
+ /*
+ * If the last lookup failed to create dentry lock, let us
+ * redo it.
+ */
+ if (!dentry->d_fsdata) {
+ mlog(0, "Inode %llu doesn't have dentry lock, "
+ "returning false\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ goto bail;
+ }
+
ret = 1;
bail:
@@ -310,22 +321,19 @@ out_attach:
return ret;
}
-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);
/* We limit the number of dentry locks to drop in one go. We have
* this limit so that we don't starve other users of ocfs2_wq. */
#define DL_INODE_DROP_COUNT 64
/* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
{
- struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
- dentry_lock_work);
struct ocfs2_dentry_lock *dl;
- int drop_count = DL_INODE_DROP_COUNT;
spin_lock(&dentry_list_lock);
- while (osb->dentry_lock_list && drop_count--) {
+ while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
dl = osb->dentry_lock_list;
osb->dentry_lock_list = dl->dl_next;
spin_unlock(&dentry_list_lock);
@@ -333,11 +341,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
kfree(dl);
spin_lock(&dentry_list_lock);
}
- if (osb->dentry_lock_list)
+ spin_unlock(&dentry_list_lock);
+}
+
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+ struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+ dentry_lock_work);
+
+ __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+ /*
+ * Don't queue dropping if umount is in progress. We flush the
+ * list in ocfs2_dismount_volume
+ */
+ spin_lock(&dentry_list_lock);
+ if (osb->dentry_lock_list &&
+ !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
queue_work(ocfs2_wq, &osb->dentry_lock_work);
spin_unlock(&dentry_list_lock);
}
+/* Flush the whole work queue */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+ __ocfs2_drop_dl_inodes(osb, -1);
+}
+
/*
* ocfs2_dentry_iput() and friends.
*
@@ -368,7 +397,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
/* We leave dropping of inode reference to ocfs2_wq as that can
* possibly lead to inode deletion which gets tricky */
spin_lock(&dentry_list_lock);
- if (!osb->dentry_lock_list)
+ if (!osb->dentry_lock_list &&
+ !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
queue_work(ocfs2_wq, &osb->dentry_lock_work);
dl->dl_next = osb->dentry_lock_list;
osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e75f98d..f5dd1789acf1 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
u64 parent_blkno);
+extern spinlock_t dentry_list_lock;
+
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl);
void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
int skip_unhashed);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe4b283..81eff8e58322 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
lock->ast_pending, lock->ml.type);
BUG();
}
- BUG_ON(!list_empty(&lock->ast_list));
if (lock->ast_pending)
mlog(0, "lock has an ast getting flushed right now\n");
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
}
static struct backing_dev_info dlmfs_backing_dev_info = {
+ .name = "ocfs2-dlmfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260c3735..43e6e3280569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
dlm->name, res->lockname.len, res->lockname.name,
- orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+ orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
send_to);
/* send it */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index fcf879ed6930..756f5b0998e0 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
* that still has AST's pending... */
in_use = !list_empty(&lock->ast_list);
spin_unlock(&dlm->ast_lock);
- if (in_use) {
+ if (in_use && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
"while waiting for an ast!", res->lockname.len,
res->lockname.name);
@@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
- if (master_node) {
+ if (master_node && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres in progress!\n");
spin_unlock(&res->spinlock);
return DLM_FORWARD;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 62442e413a00..221c5e98957b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1851,6 +1851,7 @@ relock:
if (ret)
goto out_dio;
+ count = ocount;
ret = generic_write_checks(file, ppos, &count,
S_ISBLK(inode->i_mode));
if (ret)
@@ -1870,8 +1871,7 @@ relock:
goto out_dio;
}
} else {
- written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
- *ppos);
+ written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
}
out_dio:
@@ -1879,18 +1879,21 @@ out_dio:
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
- /*
- * The generic write paths have handled getting data
- * to disk, but since we don't make use of the dirty
- * inode list, a manual journal commit is necessary
- * here.
- */
- if (old_size != i_size_read(inode) ||
- old_clusters != OCFS2_I(inode)->ip_clusters) {
+ ret = filemap_fdatawrite_range(file->f_mapping, pos,
+ pos + count - 1);
+ if (ret < 0)
+ written = ret;
+
+ if (!ret && (old_size != i_size_read(inode) ||
+ old_clusters != OCFS2_I(inode)->ip_clusters)) {
ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
+
+ if (!ret)
+ ret = filemap_fdatawait_range(file->f_mapping, pos,
+ pos + count - 1);
}
/*
@@ -1918,8 +1921,10 @@ out_sems:
mutex_unlock(&inode->i_mutex);
+ if (written)
+ ret = written;
mlog_exit(ret);
- return written ? written : ret;
+ return ret;
}
static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
@@ -1988,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
if (ret > 0) {
unsigned long nr_pages;
+ int err;
- *ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- /*
- * If file or inode is SYNC and we actually wrote some data,
- * sync it.
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- mutex_lock(&inode->i_mutex);
- err = ocfs2_rw_lock(inode, 1);
- if (err < 0) {
- mlog_errno(err);
- } else {
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
+ err = generic_write_sync(out, *ppos, ret);
+ if (err)
+ ret = err;
+ else
+ *ppos += ret;
- if (err)
- ret = err;
- }
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f033760ecbea..c48b93ac6b65 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
os->os_osb = osb;
os->os_count = 0;
os->os_seqno = 0;
- os->os_scantime = CURRENT_TIME;
mutex_init(&os->os_lock);
INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+ struct ocfs2_orphan_scan *os;
+
+ os = &osb->osb_orphan_scan;
+ os->os_scantime = CURRENT_TIME;
if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 5432c7f79cc6..2c3222aec622 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
/* Exported only for the journal struct init code in super.c. Do not call. */
void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
@@ -329,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle,
/* extended attribute block update */
#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
+
/* global quotafile inode update, data block */
-#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
/*
* The two writes below can accidentally see global info dirty due
* to set_info() quotactl so make them prepared for the writes.
*/
/* quota data block, global info */
/* Write to local quota file */
-#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
/* global quota data block, local quota data block, global quota inode,
* global quota info */
-#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
static inline int ocfs2_quota_trans_credits(struct super_block *sb)
{
@@ -355,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
return credits;
}
-/* Number of credits needed for removing quota structure from file */
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
-/* Number of credits needed for initialization of new quota structure */
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
-
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c9345ebb8493..39e1d5a39505 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,10 +224,12 @@ enum ocfs2_mount_options
OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
};
-#define OCFS2_OSB_SOFT_RO 0x0001
-#define OCFS2_OSB_HARD_RO 0x0002
-#define OCFS2_OSB_ERROR_FS 0x0004
-#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+#define OCFS2_OSB_SOFT_RO 0x0001
+#define OCFS2_OSB_HARD_RO 0x0002
+#define OCFS2_OSB_ERROR_FS 0x0004
+#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008
+
+#define OCFS2_DEFAULT_ATIME_QUANTUM 60
struct ocfs2_journal;
struct ocfs2_slot_info;
@@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
spin_unlock(&osb->osb_lock);
}
+
+static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
+ unsigned long flag)
+{
+ unsigned long ret;
+
+ spin_lock(&osb->osb_lock);
+ ret = osb->osb_flags & flag;
+ spin_unlock(&osb->osb_lock);
+ return ret;
+}
+
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
int hard)
{
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index fcdba091af3d..c212cf5a2bdf 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+ [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
};
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e08706..e5df9d170b0c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo {
unsigned int dqi_chunks; /* Number of chunks in local quota file */
unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
unsigned int dqi_syncms; /* How often should we sync with other nodes */
- unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
struct list_head dqi_chunk; /* List of chunks */
struct inode *dqi_gqinode; /* Global quota file inode */
struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
@@ -110,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
struct buffer_head **bh);
-extern struct dquot_operations ocfs2_quota_operations;
+extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
int ocfs2_quota_setup(void);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index edfa60cd155c..3cf0ec0acdd5 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -23,6 +23,7 @@
#include "sysfile.h"
#include "dlmglue.h"
#include "uptodate.h"
+#include "super.h"
#include "quota.h"
static struct workqueue_struct *ocfs2_quota_wq = NULL;
@@ -69,6 +70,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
d->dqb_btime = cpu_to_le64(m->dqb_btime);
d->dqb_itime = cpu_to_le64(m->dqb_itime);
+ d->dqb_pad1 = d->dqb_pad2 = 0;
}
static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
@@ -113,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
int rc = 0;
struct buffer_head *tmp = *bh;
+ if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
+ ocfs2_error(inode->i_sb,
+ "Quota file %llu is probably corrupted! Requested "
+ "to read block %Lu but file has size only %Lu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_block,
+ (unsigned long long)i_size_read(inode));
+ return -EIO;
+ }
rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
ocfs2_validate_quota_block);
if (rc)
@@ -143,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block,
err = -EIO;
mlog_errno(err);
}
- return err;;
+ return err;
}
/* Read data from global quotafile - avoid pagecache and such because we cannot
@@ -211,14 +222,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
if (gqinode->i_size < off + len) {
- down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
- err = ocfs2_extend_no_holes(gqinode, off + len, off);
- up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
- if (err < 0)
- goto out;
+ loff_t rounded_end =
+ ocfs2_align_bytes_to_blocks(sb, off + len);
+
+ /* Space is already allocated in ocfs2_global_read_dquot() */
err = ocfs2_simple_size_update(gqinode,
oinfo->dqi_gqi_bh,
- off + len);
+ rounded_end);
if (err < 0)
goto out;
new = 1;
@@ -234,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
}
if (err) {
mlog_errno(err);
- return err;
+ goto out;
}
lock_buffer(bh);
if (new)
@@ -342,7 +352,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
- oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -352,7 +361,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
- oinfo->dqi_syncjiff);
+ msecs_to_jiffies(oinfo->dqi_syncms));
out_err:
mlog_exit(status);
@@ -402,13 +411,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
return err;
}
+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
+{
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+ /*
+ * We may need to allocate tree blocks and a leaf block but not the
+ * root block
+ */
+ return oinfo->dqi_gi.dqi_qtree_depth;
+}
+
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+ /* We modify all the allocated blocks, tree root, and info block */
+ return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
+}
+
/* Read in information from global quota file and acquire a reference to it.
* dquot_acquire() has already started the transaction and locked quota file */
int ocfs2_global_read_dquot(struct dquot *dquot)
{
int err, err2, ex = 0;
- struct ocfs2_mem_dqinfo *info =
- sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct inode *gqinode = info->dqi_gqinode;
+ int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+ handle_t *handle = NULL;
err = ocfs2_qinfo_lock(info, 0);
if (err < 0)
@@ -419,14 +451,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
OCFS2_DQUOT(dquot)->dq_use_count++;
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ ocfs2_qinfo_unlock(info, 0);
+
if (!dquot->dq_off) { /* No real quota entry? */
- /* Upgrade to exclusive lock for allocation */
- ocfs2_qinfo_unlock(info, 0);
- err = ocfs2_qinfo_lock(info, 1);
- if (err < 0)
- goto out_qlock;
ex = 1;
+ /*
+ * Add blocks to quota file before we start a transaction since
+ * locking allocators ranks above a transaction start
+ */
+ WARN_ON(journal_current_handle());
+ down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+ err = ocfs2_extend_no_holes(gqinode,
+ gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
+ gqinode->i_size);
+ up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+ if (err < 0)
+ goto out;
+ }
+
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_global_qinit_credits(sb, type));
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out;
}
+ err = ocfs2_qinfo_lock(info, ex);
+ if (err < 0)
+ goto out_trans;
err = qtree_write_dquot(&info->dqi_gi, dquot);
if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
@@ -438,6 +489,9 @@ out_qlock:
ocfs2_qinfo_unlock(info, 1);
else
ocfs2_qinfo_unlock(info, 0);
+out_trans:
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
out:
if (err < 0)
mlog_errno(err);
@@ -607,7 +661,7 @@ static void qsync_work_fn(struct work_struct *work)
dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
- oinfo->dqi_syncjiff);
+ msecs_to_jiffies(oinfo->dqi_syncms));
}
/*
@@ -635,20 +689,18 @@ out:
return status;
}
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
{
- struct ocfs2_mem_dqinfo *oinfo;
- int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-
- if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
- return 0;
-
- oinfo = sb_dqinfo(sb, type)->dqi_priv;
- /* We modify tree, leaf block, global info, local chunk header,
- * global and local inode */
- return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
- 2 * OCFS2_INODE_UPDATE_CREDITS;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ /*
+ * We modify tree, leaf block, global info, local chunk header,
+ * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
+ * accounts for inode update
+ */
+ return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
+ OCFS2_QINFO_WRITE_CREDITS +
+ OCFS2_INODE_UPDATE_CREDITS;
}
static int ocfs2_release_dquot(struct dquot *dquot)
@@ -680,33 +732,10 @@ out:
return status;
}
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
-{
- struct ocfs2_mem_dqinfo *oinfo;
- int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
- struct ocfs2_dinode *lfe, *gfe;
-
- if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
- return 0;
-
- oinfo = sb_dqinfo(sb, type)->dqi_priv;
- gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
- lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
- /* We can extend local file + global file. In local file we
- * can modify info, chunk header block and dquot block. In
- * global file we can modify info, tree and leaf block */
- return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
- ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
- 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
-}
-
static int ocfs2_acquire_dquot(struct dquot *dquot)
{
- handle_t *handle;
struct ocfs2_mem_dqinfo *oinfo =
sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
- struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
int status = 0;
mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
@@ -715,16 +744,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
- handle = ocfs2_start_trans(osb,
- ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto out_ilock;
- }
status = dquot_acquire(dquot);
- ocfs2_commit_trans(osb, handle);
-out_ilock:
ocfs2_unlock_global_qf(oinfo, 1);
out:
mlog_exit(status);
@@ -829,7 +849,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
kmem_cache_free(ocfs2_dquot_cachep, dquot);
}
-struct dquot_operations ocfs2_quota_operations = {
+const struct dquot_operations ocfs2_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 5a460fa82553..bdb09cb6e1fe 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -20,6 +20,7 @@
#include "sysfile.h"
#include "dlmglue.h"
#include "quota.h"
+#include "uptodate.h"
/* Number of local quota structures per block */
static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
handle_t *handle;
int status;
- handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -610,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
goto out_bh;
/* Mark quota file as clean if we are recovering quota file of
* some other node. */
- handle = ocfs2_start_trans(osb, 1);
+ handle = ocfs2_start_trans(osb,
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -940,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
struct ocfs2_local_disk_chunk *dchunk;
int status;
handle_t *handle;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh = NULL, *dbh = NULL;
u64 p_blkno;
/* We are protected by dqio_sem so no locking needed */
@@ -964,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
mlog_errno(status);
goto out;
}
+ /* Local quota info and two new blocks we initialize */
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ /* Initialize chunk header */
down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
&p_blkno, NULL, NULL);
up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
if (status < 0) {
mlog_errno(status);
- goto out;
+ goto out_trans;
}
bh = sb_getblk(sb, p_blkno);
if (!bh) {
status = -ENOMEM;
mlog_errno(status);
- goto out;
+ goto out_trans;
}
dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
-
- handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto out;
- }
-
+ ocfs2_set_new_buffer_uptodate(lqinode, bh);
status = ocfs2_journal_access_dq(handle, lqinode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto out_trans;
@@ -999,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
memset(dchunk->dqc_bitmap, 0,
sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
OCFS2_QBLK_RESERVED_SPACE);
- set_buffer_uptodate(bh);
unlock_buffer(bh);
status = ocfs2_journal_dirty(handle, bh);
if (status < 0) {
@@ -1007,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
goto out_trans;
}
+ /* Initialize new block with structures */
+ down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+ &p_blkno, NULL, NULL);
+ up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ dbh = sb_getblk(sb, p_blkno);
+ if (!dbh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_trans;
+ }
+ ocfs2_set_new_buffer_uptodate(lqinode, dbh);
+ status = ocfs2_journal_access_dq(handle, lqinode, dbh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(dbh);
+ memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+ unlock_buffer(dbh);
+ status = ocfs2_journal_dirty(handle, dbh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ /* Update local quotafile info */
oinfo->dqi_blocks += 2;
oinfo->dqi_chunks++;
status = ocfs2_local_write_info(sb, type);
@@ -1031,6 +1068,7 @@ out_trans:
ocfs2_commit_trans(OCFS2_SB(sb), handle);
out:
brelse(bh);
+ brelse(dbh);
kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
return ERR_PTR(status);
}
@@ -1048,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
struct ocfs2_local_disk_chunk *dchunk;
int epb = ol_quota_entries_per_block(sb);
unsigned int chunk_blocks;
+ struct buffer_head *bh;
+ u64 p_blkno;
int status;
handle_t *handle;
@@ -1075,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
mlog_errno(status);
goto out;
}
- handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+
+ /* Get buffer from the just added block */
+ down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+ &p_blkno, NULL, NULL);
+ up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ bh = sb_getblk(sb, p_blkno);
+ if (!bh) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out;
+ }
+ ocfs2_set_new_buffer_uptodate(lqinode, bh);
+
+ /* Local quota info, chunk header and the new block we initialize */
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
goto out;
}
+ /* Zero created block */
+ status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ unlock_buffer(bh);
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ /* Update chunk header */
status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
@@ -1097,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
mlog_errno(status);
goto out_trans;
}
+ /* Update file header */
oinfo->dqi_blocks++;
status = ocfs2_local_write_info(sb, type);
if (status < 0) {
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3f661376a2de..e49c41050264 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,6 +17,7 @@
* General Public License for more details.
*/
+#include <linux/kernel.h>
#include <linux/crc32.h>
#include <linux/module.h>
@@ -153,7 +154,7 @@ static int status_map[] = {
static int dlm_status_to_errno(enum dlm_status status)
{
- BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+ BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
return status_map[status];
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7efb349fb9bd..faca4720aa47 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
}
di = (struct ocfs2_dinode *) (*bh)->b_data;
memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+ spin_lock_init(&stats->b_lock);
status = ocfs2_verify_volume(di, *bh, blksize, stats);
if (status >= 0)
goto bail;
@@ -964,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
}
-static struct quotactl_ops ocfs2_quotactl_ops = {
+static const struct quotactl_ops ocfs2_quotactl_ops = {
.quota_on = ocfs2_quota_on,
.quota_off = ocfs2_quota_off,
.quota_sync = vfs_quota_sync,
@@ -1182,7 +1183,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
wake_up(&osb->osb_mount_event);
/* Start this when the mount is almost sure of being successful */
- ocfs2_orphan_scan_init(osb);
+ ocfs2_orphan_scan_start(osb);
mlog_exit(status);
return status;
@@ -1213,14 +1214,31 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
mnt);
}
+static void ocfs2_kill_sb(struct super_block *sb)
+{
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ /* Failed mount? */
+ if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
+ goto out;
+
+ /* Prevent further queueing of inode drop events */
+ spin_lock(&dentry_list_lock);
+ ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
+ spin_unlock(&dentry_list_lock);
+ /* Wait for work to finish and/or remove it */
+ cancel_work_sync(&osb->dentry_lock_work);
+out:
+ kill_block_super(sb);
+}
+
static struct file_system_type ocfs2_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2",
.get_sb = ocfs2_get_sb, /* is this called when we mount
* the fs? */
- .kill_sb = kill_block_super, /* set to the generic one
- * right now, but do we
- * need to change that? */
+ .kill_sb = ocfs2_kill_sb,
+
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
.next = NULL
};
@@ -1819,6 +1837,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
debugfs_remove(osb->osb_ctxt);
+ /*
+ * Flush inode dropping work queue so that deletes are
+ * performed while the filesystem is still working
+ */
+ ocfs2_drop_all_dl_inodes(osb);
+
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
@@ -1981,6 +2005,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ ocfs2_orphan_scan_init(osb);
+
status = ocfs2_recovery_init(osb);
if (status) {
mlog(ML_ERROR, "Unable to initialize recovery state\n");
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ba320e250747..d1a27cda984f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
struct ocfs2_xattr_block *xb;
struct ocfs2_xattr_value_root *xv;
size_t size;
- int ret = -ENODATA, name_offset, name_len, block_off, i;
+ int ret = -ENODATA, name_offset, name_len, i;
+ int uninitialized_var(block_off);
xs->bucket = ocfs2_xattr_bucket_new(inode);
if (!xs->bucket) {
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c7275cfbdcfb..3680bae335b5 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -489,7 +489,7 @@ out:
return ret;
}
-struct inode_operations omfs_dir_inops = {
+const struct inode_operations omfs_dir_inops = {
.lookup = omfs_lookup,
.mkdir = omfs_mkdir,
.rename = omfs_rename,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d17e774eaf45..4845fbb18e6e 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = {
.splice_read = generic_file_splice_read,
};
-struct inode_operations omfs_file_inops = {
+const struct inode_operations omfs_file_inops = {
.truncate = omfs_truncate
};
-struct address_space_operations omfs_aops = {
+const struct address_space_operations omfs_aops = {
.readpage = omfs_readpage,
.readpages = omfs_readpages,
.writepage = omfs_writepage,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 379ae5fb4411..f3b7c1541f3a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct super_operations omfs_sops = {
+static const struct super_operations omfs_sops = {
.write_inode = omfs_write_inode,
.delete_inode = omfs_delete_inode,
.put_super = omfs_put_super,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 2bc0f0670406..df71039945ac 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -45,15 +45,15 @@ extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
/* dir.c */
extern struct file_operations omfs_dir_operations;
-extern struct inode_operations omfs_dir_inops;
+extern const struct inode_operations omfs_dir_inops;
extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
u64 fsblock);
/* file.c */
extern struct file_operations omfs_file_operations;
-extern struct inode_operations omfs_file_inops;
-extern struct address_space_operations omfs_aops;
+extern const struct inode_operations omfs_file_inops;
+extern const struct address_space_operations omfs_aops;
extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
extern int omfs_shrink_inode(struct inode *inode);
diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..31191bf513e4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
struct file *filp)
{
- int err;
+ int ret;
struct iattr newattrs;
/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
}
/* Remove suid/sgid on truncate too */
- newattrs.ia_valid |= should_remove_suid(dentry);
+ ret = should_remove_suid(dentry);
+ if (ret)
+ newattrs.ia_valid |= ret | ATTR_FORCE;
mutex_lock(&dentry->d_inode->i_mutex);
- err = notify_change(dentry, &newattrs);
+ ret = notify_change(dentry, &newattrs);
mutex_unlock(&dentry->d_inode->i_mutex);
- return err;
+ return ret;
}
static long do_sys_truncate(const char __user *pathname, loff_t length)
@@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
int error;
struct file *f;
+ validate_creds(cred);
+
/*
* We must always pass in a valid mount pointer. Historically
* callers got away with not passing it, but we must enforce this at
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..7b685e10cbad 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
part_stat_read(p, merges[WRITE]),
(unsigned long long)part_stat_read(p, sectors[WRITE]),
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
- p->in_flight,
+ part_in_flight(p),
jiffies_to_msecs(part_stat_read(p, io_ticks)),
jiffies_to_msecs(part_stat_read(p, time_in_queue)));
}
+ssize_t part_inflight_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_size.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_stat.attr,
+ &dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = {
.attrs = part_attrs,
};
-static struct attribute_group *part_attr_groups[] = {
+static const struct attribute_group *part_attr_groups[] = {
&part_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
&blk_trace_attr_group,
@@ -571,7 +581,7 @@ try_scan:
}
if (from + size > get_capacity(disk)) {
- struct block_device_operations *bdops = disk->fops;
+ const struct block_device_operations *bdops = disk->fops;
unsigned long long capacity;
printk(KERN_WARNING
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3ce5ae9e3d2d..55c4c805a756 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task)
struct mm_struct *mm_for_maps(struct task_struct *task)
{
- struct mm_struct *mm = get_task_mm(task);
- if (!mm)
+ struct mm_struct *mm;
+
+ if (mutex_lock_killable(&task->cred_guard_mutex))
return NULL;
- down_read(&mm->mmap_sem);
- task_lock(task);
- if (task->mm != mm)
- goto out;
- if (task->mm != current->mm &&
- __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
- goto out;
- task_unlock(task);
+
+ mm = get_task_mm(task);
+ if (mm && mm != current->mm &&
+ !ptrace_may_access(task, PTRACE_MODE_READ)) {
+ mmput(mm);
+ mm = NULL;
+ }
+ mutex_unlock(&task->cred_guard_mutex);
+
return mm;
-out:
- task_unlock(task);
- up_read(&mm->mmap_sem);
- mmput(mm);
- return NULL;
}
static int proc_pid_cmdline(struct task_struct *task, char * buffer)
@@ -450,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
do_posix_clock_monotonic_gettime(&uptime);
read_lock(&tasklist_lock);
- points = badness(task, uptime.tv_sec);
+ points = badness(task->group_leader, uptime.tv_sec);
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
@@ -1002,16 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
char buffer[PROC_NUMBUF];
size_t len;
- int oom_adjust;
+ int oom_adjust = OOM_DISABLE;
+ unsigned long flags;
if (!task)
return -ESRCH;
- task_lock(task);
- if (task->mm)
- oom_adjust = task->mm->oom_adj;
- else
- oom_adjust = OOM_DISABLE;
- task_unlock(task);
+
+ if (lock_task_sighand(task, &flags)) {
+ oom_adjust = task->signal->oom_adj;
+ unlock_task_sighand(task, &flags);
+ }
+
put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1023,40 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
- int oom_adjust;
+ char buffer[PROC_NUMBUF];
+ long oom_adjust;
+ unsigned long flags;
+ int err;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- oom_adjust = simple_strtol(buffer, &end, 0);
+
+ err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
+ if (err)
+ return -EINVAL;
if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
oom_adjust != OOM_DISABLE)
return -EINVAL;
- if (*end == '\n')
- end++;
+
task = get_proc_task(file->f_path.dentry->d_inode);
if (!task)
return -ESRCH;
- task_lock(task);
- if (!task->mm) {
- task_unlock(task);
+ if (!lock_task_sighand(task, &flags)) {
put_task_struct(task);
- return -EINVAL;
+ return -ESRCH;
}
- if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
- task_unlock(task);
+
+ if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
+ unlock_task_sighand(task, &flags);
put_task_struct(task);
return -EACCES;
}
- task->mm->oom_adj = oom_adjust;
- task_unlock(task);
+
+ task->signal->oom_adj = oom_adjust;
+
+ unlock_task_sighand(task, &flags);
put_task_struct(task);
- if (end - buffer == 0)
- return -EIO;
- return end - buffer;
+
+ return count;
}
static const struct file_operations proc_oom_adjust_operations = {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 59b43a068872..f06f45b42181 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -328,43 +328,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
return -EFAULT;
} else if (is_vmalloc_addr((void *)start)) {
char * elf_buf;
- struct vm_struct *m;
- unsigned long curstart = start;
- unsigned long cursize = tsz;
elf_buf = kzalloc(tsz, GFP_KERNEL);
if (!elf_buf)
return -ENOMEM;
-
- read_lock(&vmlist_lock);
- for (m=vmlist; m && cursize; m=m->next) {
- unsigned long vmstart;
- unsigned long vmsize;
- unsigned long msize = m->size - PAGE_SIZE;
-
- if (((unsigned long)m->addr + msize) <
- curstart)
- continue;
- if ((unsigned long)m->addr > (curstart +
- cursize))
- break;
- vmstart = (curstart < (unsigned long)m->addr ?
- (unsigned long)m->addr : curstart);
- if (((unsigned long)m->addr + msize) >
- (curstart + cursize))
- vmsize = curstart + cursize - vmstart;
- else
- vmsize = (unsigned long)m->addr +
- msize - vmstart;
- curstart = vmstart + vmsize;
- cursize -= vmsize;
- /* don't dump ioremap'd stuff! (TA) */
- if (m->flags & VM_IOREMAP)
- continue;
- memcpy(elf_buf + (vmstart - start),
- (char *)vmstart, vmsize);
- }
- read_unlock(&vmlist_lock);
+ vread(elf_buf, (char *)start, tsz);
+ /* we have to zero-fill user buffer even if no read */
if (copy_to_user(buffer, elf_buf, tsz)) {
kfree(elf_buf);
return -EFAULT;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d5c410d47fae..171e052c07b3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
"Writeback: %8lu kB\n"
"AnonPages: %8lu kB\n"
"Mapped: %8lu kB\n"
+ "Shmem: %8lu kB\n"
"Slab: %8lu kB\n"
"SReclaimable: %8lu kB\n"
"SUnreclaim: %8lu kB\n"
+ "KernelStack: %8lu kB\n"
"PageTables: %8lu kB\n"
#ifdef CONFIG_QUICKLIST
"Quicklists: %8lu kB\n"
@@ -124,10 +126,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_WRITEBACK)),
K(global_page_state(NR_ANON_PAGES)),
K(global_page_state(NR_FILE_MAPPED)),
+ K(global_page_state(NR_SHMEM)),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+ global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
K(global_page_state(NR_PAGETABLE)),
#ifdef CONFIG_QUICKLIST
K(quicklist_total_size()),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2707c6c7a20f..2281c2cbfe2b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -2,6 +2,7 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/init.h>
+#include <linux/ksm.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
@@ -95,6 +96,8 @@ static const struct file_operations proc_kpagecount_operations = {
#define KPF_UNEVICTABLE 18
#define KPF_NOPAGE 20
+#define KPF_KSM 21
+
/* kernel hacking assistances
* WARNING: subject to change, never rely on them!
*/
@@ -137,6 +140,8 @@ static u64 get_uflags(struct page *page)
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
+ if (PageKsm(page))
+ u |= 1 << KPF_KSM;
/*
* compound pages: export both head/tail info
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..59e98fea34a4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
mm = mm_for_maps(priv->task);
if (!mm)
return NULL;
+ down_read(&mm->mmap_sem);
tail_vma = get_gate_vma(priv->task);
priv->tail_vma = tail_vma;
@@ -464,6 +465,10 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
+#define CLEAR_REFS_ALL 1
+#define CLEAR_REFS_ANON 2
+#define CLEAR_REFS_MAPPED 3
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -471,13 +476,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
char buffer[PROC_NUMBUF], *end;
struct mm_struct *mm;
struct vm_area_struct *vma;
+ int type;
memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1)
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- if (!simple_strtol(buffer, &end, 0))
+ type = simple_strtol(buffer, &end, 0);
+ if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
return -EINVAL;
if (*end == '\n')
end++;
@@ -493,9 +500,23 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
clear_refs_walk.private = vma;
- if (!is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_start, vma->vm_end,
- &clear_refs_walk);
+ if (is_vm_hugetlb_page(vma))
+ continue;
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ *
+ * Writing 2 to /proc/pid/clear_refs only affects
+ * Anonymous pages.
+ *
+ * Writing 3 to /proc/pid/clear_refs only affects file
+ * mapped pages.
+ */
+ if (type == CLEAR_REFS_ANON && vma->vm_file)
+ continue;
+ if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ continue;
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &clear_refs_walk);
}
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
priv->task = NULL;
return NULL;
}
+ down_read(&mm->mmap_sem);
/* start from the Nth VMA */
for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 38f7bd559f35..39b49c42a7ed 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info);
/*
* Definitions of diskquota operations.
*/
-struct dquot_operations dquot_operations = {
+const struct dquot_operations dquot_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -2461,7 +2461,7 @@ out:
}
EXPORT_SYMBOL(vfs_set_dqinfo);
-struct quotactl_ops vfs_quotactl_ops = {
+const struct quotactl_ops vfs_quotactl_ops = {
.quota_on = vfs_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;
static struct backing_dev_info ramfs_backing_dev_info = {
+ .name = "ramfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7adea74d6a8a..f0ad05f38022 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
static int reiserfs_write_info(struct super_block *, int);
static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
-static struct dquot_operations reiserfs_quota_operations = {
+static const struct dquot_operations reiserfs_quota_operations = {
.initialize = dquot_initialize,
.drop = dquot_drop,
.alloc_space = dquot_alloc_space,
@@ -629,7 +629,7 @@ static struct dquot_operations reiserfs_quota_operations = {
.destroy_dquot = dquot_destroy,
};
-static struct quotactl_ops reiserfs_qctl_operations = {
+static const struct quotactl_ops reiserfs_qctl_operations = {
.quota_on = reiserfs_quota_on,
.quota_off = vfs_quota_off,
.quota_sync = vfs_quota_sync,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 4ab3c03d8f95..47f132df0c3f 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = {
.readdir = romfs_readdir,
};
-static struct inode_operations romfs_dir_inode_operations = {
+static const struct inode_operations romfs_dir_inode_operations = {
.lookup = romfs_lookup,
};
diff --git a/fs/select.c b/fs/select.c
index d870237e42c7..8084834e123e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq)
{
init_poll_funcptr(&pwq->pt, __pollwait);
pwq->polling_task = current;
+ pwq->triggered = 0;
pwq->error = 0;
pwq->table = NULL;
pwq->inline_index = 0;
diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..7394e9e17534 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
len = left;
ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
- if (ret > 0)
+ if (ret > 0) {
*ppos += ret;
+ file_accessed(in);
+ }
return ret;
}
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
ret = file_remove_suid(out);
- if (!ret)
+ if (!ret) {
+ file_update_time(out);
ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+ }
mutex_unlock(&inode->i_mutex);
} while (ret > 0);
splice_from_pipe_end(pipe, &sd);
@@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (ret > 0) {
unsigned long nr_pages;
+ int err;
- *ppos += ret;
nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- /*
- * If file or inode is SYNC and we actually wrote some data,
- * sync it.
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- int err;
-
- mutex_lock(&inode->i_mutex);
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- mutex_unlock(&inode->i_mutex);
-
- if (err)
- ret = err;
- }
+ err = generic_write_sync(out, *ppos, ret);
+ if (err)
+ ret = err;
+ else
+ *ppos += ret;
balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cb5fc57e370b..6c197ef53add 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -44,7 +44,7 @@
#include "squashfs.h"
static struct file_system_type squashfs_fs_type;
-static struct super_operations squashfs_super_ops;
+static const struct super_operations squashfs_super_ops;
static int supported_squashfs_filesystem(short major, short minor, short comp)
{
@@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV
};
-static struct super_operations squashfs_super_ops = {
+static const struct super_operations squashfs_super_ops = {
.alloc_inode = squashfs_alloc_inode,
.destroy_inode = squashfs_destroy_inode,
.statfs = squashfs_statfs,
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..0e7207b9815c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock);
static struct super_block *alloc_super(struct file_system_type *type)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
- static struct super_operations default_op;
+ static const struct super_operations default_op;
if (s) {
if (security_sb_alloc(s)) {
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
s = NULL;
goto out;
}
- INIT_LIST_HEAD(&s->s_dirty);
- INIT_LIST_HEAD(&s->s_io);
- INIT_LIST_HEAD(&s->s_more_io);
INIT_LIST_HEAD(&s->s_files);
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
* Drops a temporary reference, frees superblock if there's no
* references left.
*/
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
{
spin_lock(&sb_lock);
__put_super(sb);
@@ -710,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+
+ /*
+ * We set the bdi here to the queue backing, file systems can
+ * overwrite this in ->fill_super()
+ */
+ s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,29 @@
SYNC_FILE_RANGE_WAIT_AFTER)
/*
- * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * just dirties buffers with inodes so we have to submit IO for these buffers
- * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
- * case write_inode() functions do sync_dirty_buffer() and thus effectively
- * write one block at a time.
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
+ * submit IO for these buffers via __sync_blockdev(). This also speeds up the
+ * wait == 1 case since in that case write_inode() functions do
+ * sync_dirty_buffer() and thus effectively write one block at a time.
*/
static int __sync_filesystem(struct super_block *sb, int wait)
{
+ /*
+ * This should be safe, as we require bdi backing to actually
+ * write out data in the first place
+ */
+ if (!sb->s_bdi)
+ return 0;
+
/* Avoid doing twice syncing and cache pruning for quota sync */
- if (!wait)
+ if (!wait) {
writeout_quota_sb(sb, -1);
- else
+ writeback_inodes_sb(sb);
+ } else {
sync_quota_sb(sb, -1);
- sync_inodes_sb(sb, wait);
+ sync_inodes_sb(sb);
+ }
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
return __sync_blockdev(sb->s_bdev, wait);
@@ -99,7 +108,7 @@ restart:
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
- if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+ if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
__sync_filesystem(sb, wait);
up_read(&sb->s_umount);
@@ -118,7 +127,7 @@ restart:
*/
SYSCALL_DEFINE0(sync)
{
- wakeup_pdflush(0);
+ wakeup_flusher_threads(0);
sync_filesystems(0);
sync_filesystems(1);
if (unlikely(laptop_mode))
@@ -176,19 +185,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
}
/**
- * vfs_fsync - perform a fsync or fdatasync on a file
+ * vfs_fsync_range - helper to sync a range of data & metadata to disk
* @file: file to sync
* @dentry: dentry of @file
- * @data: only perform a fdatasync operation
+ * @start: offset in bytes of the beginning of data range to sync
+ * @end: offset in bytes of the end of data range (inclusive)
+ * @datasync: perform only datasync
*
- * Write back data and metadata for @file to disk. If @datasync is
- * set only metadata needed to access modified file data is written.
+ * Write back data in range @start..@end and metadata for @file to disk. If
+ * @datasync is set only metadata needed to access modified file data is
+ * written.
*
* In case this function is called from nfsd @file may be %NULL and
* only @dentry is set. This can only happen when the filesystem
* implements the export_operations API.
*/
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
+ loff_t end, int datasync)
{
const struct file_operations *fop;
struct address_space *mapping;
@@ -212,7 +225,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
goto out;
}
- ret = filemap_fdatawrite(mapping);
+ ret = filemap_write_and_wait_range(mapping, start, end);
/*
* We need to protect against concurrent writers, which could cause
@@ -223,12 +236,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
- err = filemap_fdatawait(mapping);
- if (!ret)
- ret = err;
+
out:
return ret;
}
+EXPORT_SYMBOL(vfs_fsync_range);
+
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file: file to sync
+ * @dentry: dentry of @file
+ * @datasync: only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk. If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set. This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+}
EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +284,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
return do_fsync(fd, 1);
}
+/**
+ * generic_write_sync - perform syncing after a write if file / inode is sync
+ * @file: file to which the write happened
+ * @pos: offset where the write started
+ * @count: length of the write
+ *
+ * This is just a simple wrapper about our general syncing function.
+ */
+int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+{
+ if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
+ return 0;
+ return vfs_fsync_range(file, file->f_path.dentry, pos,
+ pos + count - 1, 1);
+}
+EXPORT_SYMBOL(generic_write_sync);
+
/*
* sys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 14f2d71ea3ce..0050fc40e8c9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
const struct inode_operations sysfs_dir_inode_operations = {
.lookup = sysfs_lookup,
.setattr = sysfs_setattr,
+ .setxattr = sysfs_setxattr,
};
static void remove_dir(struct sysfs_dirent *sd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e28cecf179f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
#include "sysfs.h"
extern struct super_block * sysfs_sb;
@@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = {
};
static struct backing_dev_info sysfs_backing_dev_info = {
+ .name = "sysfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static const struct inode_operations sysfs_inode_operations ={
.setattr = sysfs_setattr,
+ .setxattr = sysfs_setxattr,
};
int __init sysfs_inode_init(void)
@@ -42,18 +46,37 @@ int __init sysfs_inode_init(void)
return bdi_init(&sysfs_backing_dev_info);
}
+struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
+{
+ struct sysfs_inode_attrs *attrs;
+ struct iattr *iattrs;
+
+ attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+ iattrs = &attrs->ia_iattr;
+
+ /* assign default attributes */
+ iattrs->ia_mode = sd->s_mode;
+ iattrs->ia_uid = 0;
+ iattrs->ia_gid = 0;
+ iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+ return attrs;
+}
int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
{
struct inode * inode = dentry->d_inode;
struct sysfs_dirent * sd = dentry->d_fsdata;
- struct iattr * sd_iattr;
+ struct sysfs_inode_attrs *sd_attrs;
+ struct iattr *iattrs;
unsigned int ia_valid = iattr->ia_valid;
int error;
if (!sd)
return -EINVAL;
- sd_iattr = sd->s_iattr;
+ sd_attrs = sd->s_iattr;
error = inode_change_ok(inode, iattr);
if (error)
@@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
if (error)
return error;
- if (!sd_iattr) {
+ if (!sd_attrs) {
/* setting attributes for the first time, allocate now */
- sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
- if (!sd_iattr)
+ sd_attrs = sysfs_init_inode_attrs(sd);
+ if (!sd_attrs)
return -ENOMEM;
- /* assign default attributes */
- sd_iattr->ia_mode = sd->s_mode;
- sd_iattr->ia_uid = 0;
- sd_iattr->ia_gid = 0;
- sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
- sd->s_iattr = sd_iattr;
+ sd->s_iattr = sd_attrs;
+ } else {
+ /* attributes were changed at least once in past */
+ iattrs = &sd_attrs->ia_iattr;
+
+ if (ia_valid & ATTR_UID)
+ iattrs->ia_uid = iattr->ia_uid;
+ if (ia_valid & ATTR_GID)
+ iattrs->ia_gid = iattr->ia_gid;
+ if (ia_valid & ATTR_ATIME)
+ iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MTIME)
+ iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_CTIME)
+ iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = iattr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+ iattrs->ia_mode = sd->s_mode = mode;
+ }
}
+ return error;
+}
- /* attributes were changed atleast once in past */
-
- if (ia_valid & ATTR_UID)
- sd_iattr->ia_uid = iattr->ia_uid;
- if (ia_valid & ATTR_GID)
- sd_iattr->ia_gid = iattr->ia_gid;
- if (ia_valid & ATTR_ATIME)
- sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_MTIME)
- sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_CTIME)
- sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime,
- inode->i_sb->s_time_gran);
- if (ia_valid & ATTR_MODE) {
- umode_t mode = iattr->ia_mode;
-
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- mode &= ~S_ISGID;
- sd_iattr->ia_mode = sd->s_mode = mode;
- }
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct sysfs_dirent *sd = dentry->d_fsdata;
+ struct sysfs_inode_attrs *iattrs;
+ void *secdata;
+ int error;
+ u32 secdata_len = 0;
+
+ if (!sd)
+ return -EINVAL;
+ if (!sd->s_iattr)
+ sd->s_iattr = sysfs_init_inode_attrs(sd);
+ if (!sd->s_iattr)
+ return -ENOMEM;
+
+ iattrs = sd->s_iattr;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+ const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+ error = security_inode_setsecurity(dentry->d_inode, suffix,
+ value, size, flags);
+ if (error)
+ goto out;
+ error = security_inode_getsecctx(dentry->d_inode,
+ &secdata, &secdata_len);
+ if (error)
+ goto out;
+ if (iattrs->ia_secdata)
+ security_release_secctx(iattrs->ia_secdata,
+ iattrs->ia_secdata_len);
+ iattrs->ia_secdata = secdata;
+ iattrs->ia_secdata_len = secdata_len;
+ } else
+ return -EINVAL;
+out:
return error;
}
@@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
{
struct bin_attribute *bin_attr;
+ struct sysfs_inode_attrs *iattrs;
inode->i_private = sysfs_get(sd);
inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
inode->i_ino = sd->s_ino;
lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
- if (sd->s_iattr) {
+ iattrs = sd->s_iattr;
+ if (iattrs) {
/* sysfs_dirent has non-default attributes
* get them for the new inode from persistent copy
* in sysfs_dirent
*/
- set_inode_attr(inode, sd->s_iattr);
+ set_inode_attr(inode, &iattrs->ia_iattr);
+ if (iattrs->ia_secdata)
+ security_inode_notifysecctx(inode,
+ iattrs->ia_secdata,
+ iattrs->ia_secdata_len);
} else
set_default_inode_attr(inode, sd->s_mode);
-
/* initialize inode according to type */
switch (sysfs_type(sd)) {
case SYSFS_DIR:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <linux/namei.h>
#include <linux/mutex.h>
+#include <linux/security.h>
#include "sysfs.h"
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
}
const struct inode_operations sysfs_symlink_inode_operations = {
+ .setxattr = sysfs_setxattr,
.readlink = generic_readlink,
.follow_link = sysfs_follow_link,
.put_link = sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
* This file is released under the GPLv2.
*/
+#include <linux/fs.h>
+
struct sysfs_open_dirent;
/* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
struct hlist_head buffers;
};
+struct sysfs_inode_attrs {
+ struct iattr ia_iattr;
+ void *ia_secdata;
+ u32 ia_secdata_len;
+};
+
/*
* sysfs_dirent - the building block of sysfs hierarchy. Each and
* every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
unsigned int s_flags;
ino_t s_ino;
umode_t s_mode;
- struct iattr *s_iattr;
+ struct sysfs_inode_attrs *s_iattr;
};
#define SD_DEACTIVATED_BIAS INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
void sysfs_delete_inode(struct inode *inode);
int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags);
int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
int sysfs_inode_init(void);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..076ca50e9933 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,41 +54,15 @@
* @nr_to_write: how many dirty pages to write-back
*
* This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
*
* Note, this function synchronizes even VFS inodes which are locked
* (@i_mutex) by the caller of the budgeting function, because write-back does
* not touch @i_mutex.
*/
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
{
- int nr_written;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .range_end = LLONG_MAX,
- .nr_to_write = nr_to_write,
- };
-
- generic_sync_sb_inodes(c->vfs_sb, &wbc);
- nr_written = nr_to_write - wbc.nr_to_write;
-
- if (!nr_written) {
- /*
- * Re-try again but wait on pages/inodes which are being
- * written-back concurrently (e.g., by pdflush).
- */
- memset(&wbc, 0, sizeof(struct writeback_control));
- wbc.sync_mode = WB_SYNC_ALL;
- wbc.range_end = LLONG_MAX;
- wbc.nr_to_write = nr_to_write;
- generic_sync_sb_inodes(c->vfs_sb, &wbc);
- nr_written = nr_to_write - wbc.nr_to_write;
- }
-
- dbg_budg("%d pages were written back", nr_written);
- return nr_written;
+ writeback_inodes_sb(c->vfs_sb);
}
/**
@@ -741,7 +715,7 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
* ubifs_get_free_space - return amount of free space.
* @c: UBIFS file-system description object
*
- * This function calculates and retuns amount of free space to report to
+ * This function calculates and returns amount of free space to report to
* user-space.
*/
long long ubifs_get_free_space(struct ubifs_info *c)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index f3a7945527fb..4775af401167 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -510,7 +510,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
int first = 1, iip;
struct ubifs_debug_info *d = c->dbg;
- union ubifs_key lower_key, upper_key, l_key, u_key;
+ union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
unsigned long long uninitialized_var(last_sqnum);
struct ubifs_idx_node *idx;
struct list_head list;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index ce2cd8343618..dbc093afd946 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -210,6 +210,20 @@ const char *dbg_cstate(int cmt_state)
}
}
+const char *dbg_jhead(int jhead)
+{
+ switch (jhead) {
+ case GCHD:
+ return "0 (GC)";
+ case BASEHD:
+ return "1 (base)";
+ case DATAHD:
+ return "2 (data)";
+ default:
+ return "unknown journal head";
+ }
+}
+
static void dump_ch(const struct ubifs_ch *ch)
{
printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
@@ -623,8 +637,9 @@ void dbg_dump_budg(struct ubifs_info *c)
/* If we are in R/O mode, journal heads do not exist */
if (c->jheads)
for (i = 0; i < c->jhead_cnt; i++)
- printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
- c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+ printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
+ dbg_jhead(c->jheads[i].wbuf.jhead),
+ c->jheads[i].wbuf.lnum);
for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
bud = rb_entry(rb, struct ubifs_bud, rb);
printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -648,9 +663,90 @@ void dbg_dump_budg(struct ubifs_info *c)
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
{
- printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
- "flags %#x\n", lp->lnum, lp->free, lp->dirty,
- c->leb_size - lp->free - lp->dirty, lp->flags);
+ int i, spc, dark = 0, dead = 0;
+ struct rb_node *rb;
+ struct ubifs_bud *bud;
+
+ spc = lp->free + lp->dirty;
+ if (spc < c->dead_wm)
+ dead = spc;
+ else
+ dark = ubifs_calc_dark(c, spc);
+
+ if (lp->flags & LPROPS_INDEX)
+ printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
+ lp->dirty, c->leb_size - spc, spc, lp->flags);
+ else
+ printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+ "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
+ "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
+ c->leb_size - spc, spc, dark, dead,
+ (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
+
+ if (lp->flags & LPROPS_TAKEN) {
+ if (lp->flags & LPROPS_INDEX)
+ printk(KERN_CONT "index, taken");
+ else
+ printk(KERN_CONT "taken");
+ } else {
+ const char *s;
+
+ if (lp->flags & LPROPS_INDEX) {
+ switch (lp->flags & LPROPS_CAT_MASK) {
+ case LPROPS_DIRTY_IDX:
+ s = "dirty index";
+ break;
+ case LPROPS_FRDI_IDX:
+ s = "freeable index";
+ break;
+ default:
+ s = "index";
+ }
+ } else {
+ switch (lp->flags & LPROPS_CAT_MASK) {
+ case LPROPS_UNCAT:
+ s = "not categorized";
+ break;
+ case LPROPS_DIRTY:
+ s = "dirty";
+ break;
+ case LPROPS_FREE:
+ s = "free";
+ break;
+ case LPROPS_EMPTY:
+ s = "empty";
+ break;
+ case LPROPS_FREEABLE:
+ s = "freeable";
+ break;
+ default:
+ s = NULL;
+ break;
+ }
+ }
+ printk(KERN_CONT "%s", s);
+ }
+
+ for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
+ bud = rb_entry(rb, struct ubifs_bud, rb);
+ if (bud->lnum == lp->lnum) {
+ int head = 0;
+ for (i = 0; i < c->jhead_cnt; i++) {
+ if (lp->lnum == c->jheads[i].wbuf.lnum) {
+ printk(KERN_CONT ", jhead %s",
+ dbg_jhead(i));
+ head = 1;
+ }
+ }
+ if (!head)
+ printk(KERN_CONT ", bud of jhead %s",
+ dbg_jhead(bud->jhead));
+ }
+ }
+ if (lp->lnum == c->gc_lnum)
+ printk(KERN_CONT ", GC LEB");
+ printk(KERN_CONT ")\n");
}
void dbg_dump_lprops(struct ubifs_info *c)
@@ -724,7 +820,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
current->pid, lnum);
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
ubifs_err("scan error %d", (int)PTR_ERR(sleb));
return;
@@ -909,8 +1005,10 @@ out:
ubifs_msg("saved lprops statistics dump");
dbg_dump_lstats(&d->saved_lst);
ubifs_get_lp_stats(c, &lst);
+
ubifs_msg("current lprops statistics dump");
- dbg_dump_lstats(&d->saved_lst);
+ dbg_dump_lstats(&lst);
+
spin_lock(&c->space_lock);
dbg_dump_budg(c);
spin_unlock(&c->space_lock);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index c1cd73b2e06e..29d960101ea6 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -271,6 +271,7 @@ void ubifs_debugging_exit(struct ubifs_info *c);
/* Dump functions */
const char *dbg_ntype(int type);
const char *dbg_cstate(int cmt_state);
+const char *dbg_jhead(int jhead);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
@@ -321,6 +322,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
int dbg_check_lprops(struct ubifs_info *c);
int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
int row, int col);
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+ loff_t size);
/* Force the use of in-the-gaps method for testing */
@@ -425,6 +428,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_ntype(type) ""
#define dbg_cstate(cmt_state) ""
+#define dbg_jhead(jhead) ""
#define dbg_get_key_dump(c, key) ({})
#define dbg_dump_inode(c, inode) ({})
#define dbg_dump_node(c, node) ({})
@@ -460,6 +464,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
#define dbg_check_heap(c, heap, cat, add_pos) ({})
#define dbg_check_lprops(c) 0
#define dbg_check_lpt_nodes(c, cnode, row, col) 0
+#define dbg_check_inode_size(c, inode, size) 0
#define dbg_force_in_the_gaps_enabled 0
#define dbg_force_in_the_gaps() 0
#define dbg_failure_mode 0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6d34dc7e33e1..2e6481a7701c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -21,34 +21,32 @@
*/
/*
- * This file implements VFS file and inode operations of regular files, device
+ * This file implements VFS file and inode operations for regular files, device
* nodes and symlinks as well as address space operations.
*
- * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
- * page is dirty and is used for budgeting purposes - dirty pages should not be
- * budgeted. The PG_checked flag is set if full budgeting is required for the
- * page e.g., when it corresponds to a file hole or it is just beyond the file
- * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
- * fail in this function, and the budget is released in 'ubifs_write_end()'. So
- * the PG_private and PG_checked flags carry the information about how the page
- * was budgeted, to make it possible to release the budget properly.
+ * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if
+ * the page is dirty and is used for optimization purposes - dirty pages are
+ * not budgeted so the flag shows that 'ubifs_write_end()' should not release
+ * the budget for this page. The @PG_checked flag is set if full budgeting is
+ * required for the page e.g., when it corresponds to a file hole or it is
+ * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because
+ * it is OK to fail in this function, and the budget is released in
+ * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry
+ * information about how the page was budgeted, to make it possible to release
+ * the budget properly.
*
- * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
- * we implement. However, this is not true for '->writepage()', which might be
- * called with 'i_mutex' unlocked. For example, when pdflush is performing
- * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
- * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
- * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim
- * path'. So, in '->writepage()' we are only guaranteed that the page is
- * locked.
+ * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
+ * implement. However, this is not true for 'ubifs_writepage()', which may be
+ * called with @i_mutex unlocked. For example, when pdflush is doing background
+ * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
+ * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
+ * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
+ * we are only guaranteed that the page is locked.
*
- * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
- * readahead path does not have it locked ("sys_read -> generic_file_aio_read
- * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
- * not set as well. However, UBIFS disables readahead.
- *
- * This, for example means that there might be 2 concurrent '->writepage()'
- * calls for the same inode, but different inode dirty pages.
+ * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
+ * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
+ * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not
+ * set as well. However, UBIFS disables readahead.
*/
#include "ubifs.h"
@@ -449,9 +447,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
/*
* We change whole page so no need to load it. But we
* have to set the @PG_checked flag to make the further
- * code the page is new. This might be not true, but it
- * is better to budget more that to read the page from
- * the media.
+ * code know that the page is new. This might be not
+ * true, but it is better to budget more than to read
+ * the page from the media.
*/
SetPageChecked(page);
skipped_read = 1;
@@ -497,8 +495,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
}
/*
- * Whee, we aquired budgeting quickly - without involving
- * garbage-collection, committing or forceing write-back. We return
+ * Whee, we acquired budgeting quickly - without involving
+ * garbage-collection, committing or forcing write-back. We return
* with @ui->ui_mutex locked if we are appending pages, and unlocked
* otherwise. This is an optimization (slightly hacky though).
*/
@@ -562,7 +560,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
/*
* Return 0 to force VFS to repeat the whole operation, or the
- * error code if 'do_readpage()' failes.
+ * error code if 'do_readpage()' fails.
*/
copied = do_readpage(page);
goto out;
@@ -1175,11 +1173,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
- /* The other attributes may be changed at the same time as well */
+ /* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
-
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
mutex_unlock(&ui->ui_mutex);
+
out_budg:
if (budgeted)
ubifs_release_budget(c, &req);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index f0f5f15d384e..618c2701d3a7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -529,7 +529,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
* We scan the entire LEB even though we only really need to scan up to
* (c->leb_size - lp->free).
*/
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 762a7d6cec73..e589fedaf1ef 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -297,7 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
{
struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
- dbg_io("jhead %d", wbuf->jhead);
+ dbg_io("jhead %s", dbg_jhead(wbuf->jhead));
wbuf->need_sync = 1;
wbuf->c->need_wbuf_sync = 1;
ubifs_wake_up_bgt(wbuf->c);
@@ -314,7 +314,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
if (wbuf->no_timer)
return;
- dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead,
+ dbg_io("set timer for jhead %s, %llu-%llu millisecs",
+ dbg_jhead(wbuf->jhead),
div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
USEC_PER_SEC));
@@ -351,8 +352,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
/* Write-buffer is empty or not seeked */
return 0;
- dbg_io("LEB %d:%d, %d bytes, jhead %d",
- wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead);
+ dbg_io("LEB %d:%d, %d bytes, jhead %s",
+ wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
ubifs_assert(!(wbuf->avail & 7));
ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -401,7 +402,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
{
const struct ubifs_info *c = wbuf->c;
- dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead);
+ dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead));
ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
ubifs_assert(offs >= 0 && offs <= c->leb_size);
ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -508,9 +509,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
struct ubifs_info *c = wbuf->c;
int err, written, n, aligned_len = ALIGN(len, 8), offs;
- dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len,
- dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead,
- wbuf->lnum, wbuf->offs + wbuf->used);
+ dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
+ dbg_ntype(((struct ubifs_ch *)buf)->node_type),
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used);
ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -535,8 +536,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
memcpy(wbuf->buf + wbuf->used, buf, len);
if (aligned_len == wbuf->avail) {
- dbg_io("flush jhead %d wbuf to LEB %d:%d",
- wbuf->jhead, wbuf->lnum, wbuf->offs);
+ dbg_io("flush jhead %s wbuf to LEB %d:%d",
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
wbuf->offs, c->min_io_size,
wbuf->dtype);
@@ -564,8 +565,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
* minimal I/O unit. We have to fill and flush write-buffer and switch
* to the next min. I/O unit.
*/
- dbg_io("flush jhead %d wbuf to LEB %d:%d",
- wbuf->jhead, wbuf->lnum, wbuf->offs);
+ dbg_io("flush jhead %s wbuf to LEB %d:%d",
+ dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
c->min_io_size, wbuf->dtype);
@@ -698,8 +699,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
int err, rlen, overlap;
struct ubifs_ch *ch = buf;
- dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs,
- dbg_ntype(type), len, wbuf->jhead);
+ dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs,
+ dbg_ntype(type), len, dbg_jhead(wbuf->jhead));
ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
ubifs_assert(!(offs & 7) && offs < c->leb_size);
ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 64b5f3a309f5..d321baeca68d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -158,7 +158,7 @@ again:
* some. But the write-buffer mutex has to be unlocked because
* GC also takes it.
*/
- dbg_jnl("no free space jhead %d, run GC", jhead);
+ dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead));
mutex_unlock(&wbuf->io_mutex);
lnum = ubifs_garbage_collect(c, 0);
@@ -173,7 +173,8 @@ again:
* because we dropped @wbuf->io_mutex, so try once
* again.
*/
- dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
+ dbg_jnl("GC couldn't make a free LEB for jhead %s",
+ dbg_jhead(jhead));
if (retries++ < 2) {
dbg_jnl("retry (%d)", retries);
goto again;
@@ -184,7 +185,7 @@ again:
}
mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
- dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
+ dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead));
avail = c->leb_size - wbuf->offs - wbuf->used;
if (wbuf->lnum != -1 && avail >= len) {
@@ -255,7 +256,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
*lnum = c->jheads[jhead].wbuf.lnum;
*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
- dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+ dbg_jnl("jhead %s, LEB %d:%d, len %d",
+ dbg_jhead(jhead), *lnum, *offs, len);
ubifs_prepare_node(c, node, len, 0);
return ubifs_wbuf_write_nolock(wbuf, node, len);
@@ -285,7 +287,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
*lnum = c->jheads[jhead].wbuf.lnum;
*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
- dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+ dbg_jnl("jhead %s, LEB %d:%d, len %d",
+ dbg_jhead(jhead), *lnum, *offs, len);
err = ubifs_wbuf_write_nolock(wbuf, buf, len);
if (err)
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 5fa27ea031ba..0f530c684f0b 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -229,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c,
}
/**
- * xent_key_init_hash - initialize extended attribute entry key without
- * re-calculating hash function.
- * @c: UBIFS file-system description object
- * @key: key to initialize
- * @inum: host inode number
- * @hash: extended attribute entry name hash
- */
-static inline void xent_key_init_hash(const struct ubifs_info *c,
- union ubifs_key *key, ino_t inum,
- uint32_t hash)
-{
- ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
- key->u32[0] = inum;
- key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
-}
-
-/**
* xent_key_init_flash - initialize on-flash extended attribute entry key.
* @c: UBIFS file-system description object
* @k: key to initialize
@@ -295,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c,
}
/**
- * data_key_init_flash - initialize on-flash data key.
+ * highest_data_key - get the highest possible data key for an inode.
* @c: UBIFS file-system description object
- * @k: key to initialize
+ * @key: key to initialize
* @inum: inode number
- * @block: block number
*/
-static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
- ino_t inum, unsigned int block)
+static inline void highest_data_key(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
{
- union ubifs_key *key = k;
-
- ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
- key->j32[0] = cpu_to_le32(inum);
- key->j32[1] = cpu_to_le32(block |
- (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
- memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+ data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK);
}
/**
@@ -554,4 +530,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
return 0;
}
}
+
#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 56e33772a1ee..c345e125f42c 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -169,8 +169,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
*/
c->bud_bytes += c->leb_size - bud->start;
- dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
- bud->start, bud->jhead, c->bud_bytes);
+ dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum,
+ bud->start, dbg_jhead(bud->jhead), c->bud_bytes);
spin_unlock(&c->buds_lock);
}
@@ -355,16 +355,16 @@ static void remove_buds(struct ubifs_info *c)
* heads (non-closed buds).
*/
c->cmt_bud_bytes += wbuf->offs - bud->start;
- dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
+ dbg_log("preserve %d:%d, jhead %s, bud bytes %d, "
"cmt_bud_bytes %lld", bud->lnum, bud->start,
- bud->jhead, wbuf->offs - bud->start,
+ dbg_jhead(bud->jhead), wbuf->offs - bud->start,
c->cmt_bud_bytes);
bud->start = wbuf->offs;
} else {
c->cmt_bud_bytes += c->leb_size - bud->start;
- dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
+ dbg_log("remove %d:%d, jhead %s, bud bytes %d, "
"cmt_bud_bytes %lld", bud->lnum, bud->start,
- bud->jhead, c->leb_size - bud->start,
+ dbg_jhead(bud->jhead), c->leb_size - bud->start,
c->cmt_bud_bytes);
rb_erase(p1, &c->buds);
/*
@@ -429,7 +429,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
if (lnum == -1 || offs == c->leb_size)
continue;
- dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
+ dbg_log("add ref to LEB %d:%d for jhead %s",
+ lnum, offs, dbg_jhead(i));
ref = buf + len;
ref->ch.node_type = UBIFS_REF_NODE;
ref->lnum = cpu_to_le32(lnum);
@@ -695,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
lnum = c->ltail_lnum;
write_lnum = lnum;
while (1) {
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
goto out_free;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4cdd284dea56..4d4ca388889b 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -281,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
case LPROPS_FREE:
if (add_to_lpt_heap(c, lprops, cat))
break;
- /* No more room on heap so make it uncategorized */
+ /* No more room on heap so make it un-categorized */
cat = LPROPS_UNCAT;
/* Fall through */
case LPROPS_UNCAT:
@@ -375,8 +375,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
* @lprops: LEB properties
*
* A LEB may have fallen off of the bottom of a heap, and ended up as
- * uncategorized even though it has enough space for us now. If that is the case
- * this function will put the LEB back onto a heap.
+ * un-categorized even though it has enough space for us now. If that is the
+ * case this function will put the LEB back onto a heap.
*/
void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
{
@@ -436,10 +436,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c,
/**
* change_category - change LEB properties category.
* @c: UBIFS file-system description object
- * @lprops: LEB properties to recategorize
+ * @lprops: LEB properties to re-categorize
*
* LEB properties are categorized to enable fast find operations. When the LEB
- * properties change they must be recategorized.
+ * properties change they must be re-categorized.
*/
static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
{
@@ -461,21 +461,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
}
/**
- * calc_dark - calculate LEB dark space size.
+ * ubifs_calc_dark - calculate LEB dark space size.
* @c: the UBIFS file-system description object
* @spc: amount of free and dirty space in the LEB
*
- * This function calculates amount of dark space in an LEB which has @spc bytes
- * of free and dirty space. Returns the calculations result.
+ * This function calculates and returns amount of dark space in an LEB which
+ * has @spc bytes of free and dirty space.
*
- * Dark space is the space which is not always usable - it depends on which
- * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
- * it is dark space, because it cannot fit a large data node. So UBIFS cannot
- * count on this LEB and treat these 512 bytes as usable because it is not true
- * if, for example, only big chunks of uncompressible data will be written to
- * the FS.
+ * UBIFS is trying to account the space which might not be usable, and this
+ * space is called "dark space". For example, if an LEB has only %512 free
+ * bytes, it is dark space, because it cannot fit a large data node.
*/
-static int calc_dark(struct ubifs_info *c, int spc)
+int ubifs_calc_dark(const struct ubifs_info *c, int spc)
{
ubifs_assert(!(spc & 7));
@@ -518,7 +515,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
* @free: new free space amount
* @dirty: new dirty space amount
* @flags: new flags
- * @idx_gc_cnt: change to the count of idx_gc list
+ * @idx_gc_cnt: change to the count of @idx_gc list
*
* This function changes LEB properties (@free, @dirty or @flag). However, the
* property which has the %LPROPS_NC value is not changed. Returns a pointer to
@@ -535,7 +532,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
{
/*
* This is the only function that is allowed to change lprops, so we
- * discard the const qualifier.
+ * discard the "const" qualifier.
*/
struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
@@ -575,7 +572,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
if (old_spc < c->dead_wm)
c->lst.total_dead -= old_spc;
else
- c->lst.total_dark -= calc_dark(c, old_spc);
+ c->lst.total_dark -= ubifs_calc_dark(c, old_spc);
c->lst.total_used -= c->leb_size - old_spc;
}
@@ -616,7 +613,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
if (new_spc < c->dead_wm)
c->lst.total_dead += new_spc;
else
- c->lst.total_dark += calc_dark(c, new_spc);
+ c->lst.total_dark += ubifs_calc_dark(c, new_spc);
c->lst.total_used += c->leb_size - new_spc;
}
@@ -1096,7 +1093,7 @@ static int scan_check_cb(struct ubifs_info *c,
}
}
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
/*
* After an unclean unmount, empty and freeable LEBs
@@ -1107,7 +1104,7 @@ static int scan_check_cb(struct ubifs_info *c,
"- continuing checking");
lst->empty_lebs += 1;
lst->total_free += c->leb_size;
- lst->total_dark += calc_dark(c, c->leb_size);
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
return LPT_SCAN_CONTINUE;
}
@@ -1117,7 +1114,7 @@ static int scan_check_cb(struct ubifs_info *c,
"- continuing checking");
lst->total_free += lp->free;
lst->total_dirty += lp->dirty;
- lst->total_dark += calc_dark(c, c->leb_size);
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
return LPT_SCAN_CONTINUE;
}
data->err = PTR_ERR(sleb);
@@ -1235,7 +1232,7 @@ static int scan_check_cb(struct ubifs_info *c,
if (spc < c->dead_wm)
lst->total_dead += spc;
else
- lst->total_dark += calc_dark(c, spc);
+ lst->total_dark += ubifs_calc_dark(c, spc);
}
ubifs_scan_destroy(sleb);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index a88f33801b98..28beaeedadc0 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -29,7 +29,8 @@
* @c: UBIFS file-system description object
*
* This function scans the master node LEBs and search for the latest master
- * node. Returns zero in case of success and a negative error code in case of
+ * node. Returns zero in case of success, %-EUCLEAN if there master area is
+ * corrupted and requires recovery, and a negative error code in case of
* failure.
*/
static int scan_for_master(struct ubifs_info *c)
@@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c)
lnum = UBIFS_MST_LNUM;
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
nodes_cnt = sleb->nodes_cnt;
@@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c)
snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
list);
if (snod->type != UBIFS_MST_NODE)
- goto out;
+ goto out_dump;
memcpy(c->mst_node, snod->node, snod->len);
offs = snod->offs;
}
@@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c)
lnum += 1;
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
if (sleb->nodes_cnt != nodes_cnt)
@@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c)
goto out;
snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
if (snod->type != UBIFS_MST_NODE)
- goto out;
+ goto out_dump;
if (snod->offs != offs)
goto out;
if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
@@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c)
out:
ubifs_scan_destroy(sleb);
+ return -EUCLEAN;
+
+out_dump:
+ ubifs_err("unexpected node type %d master LEB %d:%d",
+ snod->type, lnum, snod->offs);
+ ubifs_scan_destroy(sleb);
return -EINVAL;
}
@@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c)
err = scan_for_master(c);
if (err) {
- err = ubifs_recover_master_node(c);
+ if (err == -EUCLEAN)
+ err = ubifs_recover_master_node(c);
if (err)
/*
* Note, we do not free 'c->mst_node' here because the
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 152a7b34a141..82009c74b6a3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -670,9 +670,10 @@ static int kill_orphans(struct ubifs_info *c)
struct ubifs_scan_leb *sleb;
dbg_rcvry("LEB %d", lnum);
- sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb)) {
- sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+ if (PTR_ERR(sleb) == -EUCLEAN)
+ sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
@@ -899,7 +900,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
struct ubifs_scan_leb *sleb;
- sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index e5f6cf8a1155..f94ddf7efba0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -286,7 +286,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
mst = mst2;
}
- dbg_rcvry("recovered master node from LEB %d",
+ ubifs_msg("recovered master node from LEB %d",
(mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
@@ -790,7 +790,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
* We can only recover at the end of the log, so check that the
* next log LEB is empty or out of date.
*/
- sleb = ubifs_scan(c, next_lnum, 0, sbuf);
+ sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
if (IS_ERR(sleb))
return sleb;
if (sleb->nodes_cnt) {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 2970500f32df..5c2d6d759a3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -506,7 +506,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
if (c->need_recovery)
sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
else
- sleb = ubifs_scan(c, lnum, offs, c->sbuf);
+ sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
if (IS_ERR(sleb))
return PTR_ERR(sleb);
@@ -836,8 +836,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
const struct ubifs_cs_node *node;
dbg_mnt("replay log LEB %d:%d", lnum, offs);
- sleb = ubifs_scan(c, lnum, offs, sbuf);
- if (IS_ERR(sleb) ) {
+ sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
+ if (IS_ERR(sleb)) {
if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
return PTR_ERR(sleb);
sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 892ebfee4fe5..96c525384191 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -108,10 +108,9 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
/* Make the node pads to 8-byte boundary */
if ((node_len + pad_len) & 7) {
- if (!quiet) {
+ if (!quiet)
dbg_err("bad padding length %d - %d",
offs, offs + node_len + pad_len);
- }
return SCANNED_A_BAD_PAD_NODE;
}
@@ -253,15 +252,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
* @c: UBIFS file-system description object
* @lnum: logical eraseblock number
* @offs: offset to start at (usually zero)
- * @sbuf: scan buffer (must be c->leb_size)
+ * @sbuf: scan buffer (must be of @c->leb_size bytes in size)
+ * @quiet: print no messages
*
* This function scans LEB number @lnum and returns complete information about
* its contents. Returns the scaned information in case of success and,
* %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
* of failure.
+ *
+ * If @quiet is non-zero, this function does not print large and scary
+ * error messages and flash dumps in case of errors.
*/
struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
- int offs, void *sbuf)
+ int offs, void *sbuf, int quiet)
{
void *buf = sbuf + offs;
int err, len = c->leb_size - offs;
@@ -280,7 +283,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
cond_resched();
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
if (ret > 0) {
/* Padding bytes or a valid padding node */
offs += ret;
@@ -320,7 +323,9 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
}
if (offs % c->min_io_size) {
- ubifs_err("empty space starts at non-aligned offset %d", offs);
+ if (!quiet)
+ ubifs_err("empty space starts at non-aligned offset %d",
+ offs);
goto corrupted;;
}
@@ -331,18 +336,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
break;
for (; len; offs++, buf++, len--)
if (*(uint8_t *)buf != 0xff) {
- ubifs_err("corrupt empty space at LEB %d:%d",
- lnum, offs);
+ if (!quiet)
+ ubifs_err("corrupt empty space at LEB %d:%d",
+ lnum, offs);
goto corrupted;
}
return sleb;
corrupted:
- ubifs_scanned_corruption(c, lnum, offs, buf);
+ if (!quiet) {
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ ubifs_err("LEB %d scanning failed", lnum);
+ }
err = -EUCLEAN;
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+
error:
- ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_err("LEB %d scanning failed, error %d", lnum, err);
ubifs_scan_destroy(sleb);
return ERR_PTR(err);
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..333e181ee987 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,7 +36,6 @@
#include <linux/mount.h>
#include <linux/math64.h>
#include <linux/writeback.h>
-#include <linux/smp_lock.h>
#include "ubifs.h"
/*
@@ -318,6 +317,8 @@ static int ubifs_write_inode(struct inode *inode, int wait)
if (err)
ubifs_err("can't write inode %lu, error %d",
inode->i_ino, err);
+ else
+ err = dbg_check_inode_size(c, inode, ui->ui_size);
}
ui->dirty = 0;
@@ -438,12 +439,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
{
int i, err;
struct ubifs_info *c = sb->s_fs_info;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .nr_to_write = LONG_MAX,
- };
/*
* Zero @wait is just an advisory thing to help the file system shove
@@ -454,17 +449,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
return 0;
/*
- * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
- * pages, so synchronize them first, then commit the journal. Strictly
- * speaking, it is not necessary to commit the journal here,
- * synchronizing write-buffers would be enough. But committing makes
- * UBIFS free space predictions much more accurate, so we want to let
- * the user be able to get more accurate results of 'statfs()' after
- * they synchronize the file system.
- */
- generic_sync_sb_inodes(sb, &wbc);
-
- /*
* Synchronize write buffers, because 'ubifs_run_commit()' does not
* do this if it waits for an already running commit.
*/
@@ -474,6 +458,13 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
return err;
}
+ /*
+ * Strictly speaking, it is not necessary to commit the journal here,
+ * synchronizing write-buffers would be enough. But committing makes
+ * UBIFS free space predictions much more accurate, so we want to let
+ * the user be able to get more accurate results of 'statfs()' after
+ * they synchronize the file system.
+ */
err = ubifs_run_commit(c);
if (err)
return err;
@@ -1726,8 +1717,6 @@ static void ubifs_put_super(struct super_block *sb)
ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
c->vi.vol_id);
- lock_kernel();
-
/*
* The following asserts are only valid if there has not been a failure
* of the media. For example, there will be dirty inodes if we failed
@@ -1792,8 +1781,6 @@ static void ubifs_put_super(struct super_block *sb)
ubi_close_volume(c->ubi);
mutex_unlock(&c->umount_mutex);
kfree(c);
-
- unlock_kernel();
}
static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1809,22 +1796,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
return err;
}
- lock_kernel();
if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
if (c->ro_media) {
ubifs_msg("cannot re-mount due to prior errors");
- unlock_kernel();
return -EROFS;
}
err = ubifs_remount_rw(c);
- if (err) {
- unlock_kernel();
+ if (err)
return err;
- }
} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
if (c->ro_media) {
ubifs_msg("cannot re-mount due to prior errors");
- unlock_kernel();
return -EROFS;
}
ubifs_remount_ro(c);
@@ -1839,7 +1821,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
}
ubifs_assert(c->lst.taken_empty_lebs > 0);
- unlock_kernel();
return 0;
}
@@ -1971,6 +1952,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
*
* Read-ahead will be disabled because @c->bdi.ra_pages is 0.
*/
+ c->bdi.name = "ubifs",
c->bdi.capabilities = BDI_CAP_MAP_COPY;
c->bdi.unplug_io_fn = default_unplug_io_fn;
err = bdi_init(&c->bdi);
@@ -1985,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto out_bdi;
+ sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f249f7b0d656..e5b1a7d00fa0 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1159,8 +1159,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
* o exact match, i.e. the found zero-level znode contains key @key, then %1
* is returned and slot number of the matched branch is stored in @n;
* o not exact match, which means that zero-level znode does not contain
- * @key, then %0 is returned and slot number of the closed branch is stored
- * in @n;
+ * @key, then %0 is returned and slot number of the closest branch is stored
+ * in @n;
* o @key is so small that it is even less than the lowest key of the
* leftmost zero-level node, then %0 is returned and %0 is stored in @n.
*
@@ -1433,7 +1433,7 @@ static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
* @lnum: LEB number is returned here
* @offs: offset is returned here
*
- * This function look up and reads node with key @key. The caller has to make
+ * This function looks up and reads node with key @key. The caller has to make
* sure the @node buffer is large enough to fit the node. Returns zero in case
* of success, %-ENOENT if the node was not found, and a negative error code in
* case of failure. The node location can be returned in @lnum and @offs.
@@ -3268,3 +3268,73 @@ out_unlock:
mutex_unlock(&c->tnc_mutex);
return err;
}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_inode_size - check if inode size is correct.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ * @size: inode size
+ *
+ * This function makes sure that the inode size (@size) is correct and it does
+ * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL
+ * if it has a data page beyond @size, and other negative error code in case of
+ * other errors.
+ */
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+ loff_t size)
+{
+ int err, n;
+ union ubifs_key from_key, to_key, *key;
+ struct ubifs_znode *znode;
+ unsigned int block;
+
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+
+ block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
+ data_key_init(c, &from_key, inode->i_ino, block);
+ highest_data_key(c, &to_key, inode->i_ino);
+
+ mutex_lock(&c->tnc_mutex);
+ err = ubifs_lookup_level0(c, &from_key, &znode, &n);
+ if (err < 0)
+ goto out_unlock;
+
+ if (err) {
+ err = -EINVAL;
+ key = &from_key;
+ goto out_dump;
+ }
+
+ err = tnc_next(c, &znode, &n);
+ if (err == -ENOENT) {
+ err = 0;
+ goto out_unlock;
+ }
+ if (err < 0)
+ goto out_unlock;
+
+ ubifs_assert(err == 0);
+ key = &znode->zbranch[n].key;
+ if (!key_in_range(c, key, &from_key, &to_key))
+ goto out_unlock;
+
+out_dump:
+ block = key_block(c, key);
+ ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
+ "(data key %s)", (unsigned long)inode->i_ino, size,
+ ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+ dbg_dump_inode(c, inode);
+ dbg_dump_stack();
+ err = -EINVAL;
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index fde8d127c768..53288e5d604e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -245,7 +245,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
* it is more comprehensive and less efficient than is needed for this
* purpose.
*/
- sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
+ sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0);
c->ileb_len = 0;
if (IS_ERR(sleb))
return PTR_ERR(sleb);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 3eee07e0c495..191ca7863fe7 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -135,6 +135,13 @@
/* The key is always at the same position in all keyed nodes */
#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
+/* Garbage collector journal head number */
+#define UBIFS_GC_HEAD 0
+/* Base journal head number */
+#define UBIFS_BASE_HEAD 1
+/* Data journal head number */
+#define UBIFS_DATA_HEAD 2
+
/*
* LEB Properties Tree node types.
*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a29349094422..b2d976366a46 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -105,12 +105,10 @@
/* Number of non-data journal heads */
#define NONDATA_JHEADS_CNT 2
-/* Garbage collector head */
-#define GCHD 0
-/* Base journal head number */
-#define BASEHD 1
-/* First "general purpose" journal head */
-#define DATAHD 2
+/* Shorter names for journal head numbers for internal usage */
+#define GCHD UBIFS_GC_HEAD
+#define BASEHD UBIFS_BASE_HEAD
+#define DATAHD UBIFS_DATA_HEAD
/* 'No change' value for 'ubifs_change_lp()' */
#define LPROPS_NC 0x80000001
@@ -1451,7 +1449,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
/* scan.c */
struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
- int offs, void *sbuf);
+ int offs, void *sbuf, int quiet);
void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
int offs, int quiet);
@@ -1676,6 +1674,7 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
+int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index adafcf556531..195830f47569 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -78,9 +78,9 @@ enum {
SECURITY_XATTR,
};
-static struct inode_operations none_inode_operations;
-static struct address_space_operations none_address_operations;
-static struct file_operations none_file_operations;
+static const struct inode_operations none_inode_operations;
+static const struct address_space_operations none_address_operations;
+static const struct file_operations none_file_operations;
/**
* create_xattr - create an extended attribute.
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
-#if 0
-static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
- uint8_t ad_size, struct kernel_lb_addr fe_loc,
- int *pos, int *offset, struct buffer_head **bh,
- int *error)
-{
- int loffset = *offset;
- int block;
- uint8_t *ad;
- int remainder;
-
- *error = 0;
-
- ad = (uint8_t *)(*bh)->b_data + *offset;
- *offset += ad_size;
-
- if (!ad) {
- brelse(*bh);
- *error = 1;
- return NULL;
- }
-
- if (*offset == dir->i_sb->s_blocksize) {
- brelse(*bh);
- block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
- if (!block)
- return NULL;
- *bh = udf_tread(dir->i_sb, block);
- if (!*bh)
- return NULL;
- } else if (*offset > dir->i_sb->s_blocksize) {
- ad = tmpad;
-
- remainder = dir->i_sb->s_blocksize - loffset;
- memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
-
- brelse(*bh);
- block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
- if (!block)
- return NULL;
- (*bh) = udf_tread(dir->i_sb, block);
- if (!*bh)
- return NULL;
-
- memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
- ad_size - remainder);
- *offset = ad_size - remainder;
- }
-
- return ad;
-}
-#endif
-
struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
struct udf_fileident_bh *fibh,
struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
return fi;
}
-#if 0
-static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
-{
- struct extent_ad *ext;
- struct fileEntry *fe;
- uint8_t *ptr;
-
- if ((!buffer) || (!offset)) {
- printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
- return NULL;
- }
-
- fe = (struct fileEntry *)buffer;
-
- if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
- udf_debug("0x%x != TAG_IDENT_FE\n",
- le16_to_cpu(fe->descTag.tagIdent));
- return NULL;
- }
-
- ptr = (uint8_t *)(fe->extendedAttr) +
- le32_to_cpu(fe->lengthExtendedAttr);
-
- if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
- ptr += *offset;
-
- ext = (struct extent_ad *)ptr;
-
- *offset = *offset + sizeof(struct extent_ad);
- return ext;
-}
-#endif
-
struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
int inc)
{
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
static int udf_release_file(struct inode *inode, struct file *filp)
{
if (filp->f_mode & FMODE_WRITE) {
+ mutex_lock(&inode->i_mutex);
lock_kernel();
udf_discard_prealloc(inode);
unlock_kernel();
+ mutex_unlock(&inode->i_mutex);
}
return 0;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
}
/*
- * If we are going to release inode from memory, we discard preallocation and
- * truncate last inode extent to proper length. We could use drop_inode() but
- * it's called under inode_lock and thus we cannot mark inode dirty there. We
- * use clear_inode() but we have to make sure to write inode as it's not written
- * automatically.
+ * If we are going to release inode from memory, we truncate last inode extent
+ * to proper length. We could use drop_inode() but it's called under inode_lock
+ * and thus we cannot mark inode dirty there. We use clear_inode() but we have
+ * to make sure to write inode as it's not written automatically.
*/
void udf_clear_inode(struct inode *inode)
{
struct udf_inode_info *iinfo;
if (!(inode->i_sb->s_flags & MS_RDONLY)) {
lock_kernel();
- /* Discard preallocation for directories, symlinks, etc. */
- udf_discard_prealloc(inode);
udf_truncate_tail_extent(inode);
unlock_kernel();
write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
#ifdef UDF_PREALLOCATE
- /* preallocate blocks */
- udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
+ /* We preallocate blocks only for regular files. It also makes sense
+ * for directories but there's a problem when to drop the
+ * preallocation. We might use some delayed work for that but I feel
+ * it's overengineering for a filesystem like UDF. */
+ if (S_ISREG(inode->i_mode))
+ udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
#endif
/* merge any continuous blocks in laarr */
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
ms_info.addr_format = CDROM_LBA;
i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
-#define WE_OBEY_THE_WRITTEN_STANDARDS 1
-
if (i == 0) {
udf_debug("XA disk: %s, vol_desc_start=%d\n",
(ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
-#if WE_OBEY_THE_WRITTEN_STANDARDS
if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
-#endif
vol_desc_start = ms_info.addr.lba;
} else {
udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
pc->componentType = 1;
pc->lengthComponentIdent = 0;
pc->componentFileVersionNum = 0;
- pc += sizeof(struct pathComponent);
elen += sizeof(struct pathComponent);
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return inode_permission(inode, mask);
}
-int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags)
+/**
+ * __vfs_setxattr_noperm - perform setxattr operation without performing
+ * permission checks.
+ *
+ * @dentry - object to perform setxattr on
+ * @name - xattr name to set
+ * @value - value to set @name to
+ * @size - size of @value
+ * @flags - flags to pass into filesystem operations
+ *
+ * returns the result of the internal setxattr or setsecurity operations.
+ *
+ * This function requires the caller to lock the inode's i_mutex before it
+ * is executed. It also assumes that the caller will make the appropriate
+ * permission checks.
+ */
+int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
- int error;
-
- error = xattr_permission(inode, name, MAY_WRITE);
- if (error)
- return error;
+ int error = -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
- error = security_inode_setxattr(dentry, name, value, size, flags);
- if (error)
- goto out;
- error = -EOPNOTSUPP;
if (inode->i_op->setxattr) {
error = inode->i_op->setxattr(dentry, name, value, size, flags);
if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!error)
fsnotify_xattr(dentry);
}
+
+ return error;
+}
+
+
+int
+vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ error = xattr_permission(inode, name, MAY_WRITE);
+ if (error)
+ return error;
+
+ mutex_lock(&inode->i_mutex);
+ error = security_inode_setxattr(dentry, name, value, size, flags);
+ if (error)
+ goto out;
+
+ error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
+
out:
mutex_unlock(&inode->i_mutex);
return error;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
if (ip->i_d.di_size < isize) {
ip->i_d.di_size = isize;
ip->i_update_core = 1;
- ip->i_update_size = 1;
xfs_mark_inode_dirty_sync(ip);
}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 0c93c7ef3d18..965df1227d64 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -770,7 +770,7 @@ xfs_buf_associate_memory(
bp->b_pages = NULL;
bp->b_addr = mem;
- rval = _xfs_buf_get_pages(bp, page_count, 0);
+ rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
if (rval)
return rval;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
*/
STATIC int
xfs_file_fsync(
- struct file *filp,
- struct dentry *dentry,
- int datasync)
+ struct file *file,
+ struct dentry *dentry,
+ int datasync)
{
- xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
- return -xfs_fsync(XFS_I(dentry->d_inode));
+ struct inode *inode = dentry->d_inode;
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ /* capture size updates in I/O completion before writing the inode. */
+ error = filemap_fdatawait(inode->i_mapping);
+ if (error)
+ return error;
+
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return -xfs_fsync(ip);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_GETVERSION_32:
cmd = _NATIVE_IOC(cmd, long);
return xfs_file_ioctl(filp, cmd, p);
- case XFS_IOC_SWAPEXT: {
+ case XFS_IOC_SWAPEXT_32: {
struct xfs_swapext sxp;
struct compat_xfs_swapext __user *sxu = arg;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..da0159d99f82 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
#include "xfs_error.h"
#include "xfs_itable.h"
#include "xfs_rw.h"
-#include "xfs_acl.h"
#include "xfs_attr.h"
#include "xfs_buf_item.h"
#include "xfs_utils.h"
@@ -485,14 +484,6 @@ xfs_vn_put_link(
}
STATIC int
-xfs_vn_permission(
- struct inode *inode,
- int mask)
-{
- return generic_permission(inode, mask, xfs_check_acl);
-}
-
-STATIC int
xfs_vn_getattr(
struct vfsmount *mnt,
struct dentry *dentry,
@@ -696,7 +687,7 @@ xfs_vn_fiemap(
}
static const struct inode_operations xfs_inode_operations = {
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.truncate = xfs_vn_truncate,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -724,7 +715,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -749,7 +740,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -762,7 +753,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = xfs_vn_follow_link,
.put_link = xfs_vn_put_link,
- .permission = xfs_vn_permission,
+ .check_acl = xfs_check_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:
/* Handle various SYNC-type writes */
if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+ loff_t end = pos + ret - 1;
int error2;
xfs_iunlock(xip, iolock);
if (need_i_mutex)
mutex_unlock(&inode->i_mutex);
- error2 = sync_page_range(inode, mapping, pos, ret);
+
+ error2 = filemap_write_and_wait_range(mapping, pos, end);
if (!error)
error = error2;
if (need_i_mutex)
mutex_lock(&inode->i_mutex);
xfs_ilock(xip, iolock);
- error2 = xfs_write_sync_logforce(mp, xip);
+
+ error2 = xfs_fsync(xip);
if (!error)
error = error2;
}
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index cb6e2cca214f..9e41f91aa269 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -150,7 +150,7 @@ xfs_fs_set_xquota(
return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
}
-struct quotactl_ops xfs_quotactl_operations = {
+const struct quotactl_ops xfs_quotactl_operations = {
.quota_sync = xfs_fs_quota_sync,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
DEFINE_PER_CPU(struct xfsstats, xfsstats);
-STATIC int
-xfs_read_xfsstats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
{
- int c, i, j, len, val;
+ int c, i, j, val;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
};
/* Loop over all stats groups */
- for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
- len += sprintf(buffer + len, "%s", xstats[i].desc);
+ for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+ seq_printf(m, "%s", xstats[i].desc);
/* inner loop does each group */
while (j < xstats[i].endpoint) {
val = 0;
/* sum over all cpus */
for_each_possible_cpu(c)
val += *(((__u32*)&per_cpu(xfsstats, c) + j));
- len += sprintf(buffer + len, " %u", val);
+ seq_printf(m, " %u", val);
j++;
}
- buffer[len++] = '\n';
+ seq_putc(m, '\n');
}
/* extra precision counters */
for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
}
- len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+ seq_printf(m, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += sprintf(buffer + len, "debug %u\n",
+ seq_printf(m, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
+ return 0;
+}
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xfs_stat_proc_show, NULL);
}
+static const struct file_operations xfs_stat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xfs_stat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
goto out;
- if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
- xfs_read_xfsstats, NULL))
+ if (!proc_create("fs/xfs/stat", 0, NULL,
+ &xfs_stat_proc_fops))
goto out_remove_entry;
return 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..bdd41c8c342f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -67,7 +67,7 @@
#include <linux/freezer.h>
#include <linux/parser.h>
-static struct super_operations xfs_super_operations;
+static const struct super_operations xfs_super_operations;
static kmem_zone_t *xfs_ioend_zone;
mempool_t *xfs_ioend_pool;
@@ -579,15 +579,19 @@ xfs_showargs(
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
seq_puts(m, "," MNTOPT_UQUOTANOENF);
- if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_PRJQUOTA);
- else if (mp->m_qflags & XFS_PQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_PQUOTANOENF);
-
- if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_GRPQUOTA);
- else if (mp->m_qflags & XFS_GQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ /* Either project or group quotas can be active, not both */
+
+ if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_PRJQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_PQUOTANOENF);
+ } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_OQUOTA_ENFD)
+ seq_puts(m, "," MNTOPT_GRPQUOTA);
+ else
+ seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ }
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, "," MNTOPT_NOQUOTA);
@@ -687,7 +691,7 @@ xfs_barrier_test(
return error;
}
-void
+STATIC void
xfs_mountfs_check_barriers(xfs_mount_t *mp)
{
int error;
@@ -1532,7 +1536,7 @@ xfs_fs_get_sb(
mnt);
}
-static struct super_operations xfs_super_operations = {
+static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
.write_inode = xfs_fs_write_inode,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 5a2ea3a21781..18175ebd58ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
extern struct xattr_handler *xfs_xattr_handlers[];
-extern struct quotactl_ops xfs_quotactl_operations;
+extern const struct quotactl_ops xfs_quotactl_operations;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b619d6b8ca43..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
return 0;
}
+void
+__xfs_inode_set_reclaim_tag(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip)
+{
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_RECLAIM_TAG);
+}
+
/*
* We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
- radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
@@ -740,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
}
-void
-xfs_inode_clear_reclaim_tag(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
-
- read_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- spin_unlock(&ip->i_flags_lock);
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
-}
-
STATIC int
xfs_reclaim_inode_now(
struct xfs_inode *ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 2a10301c99c7..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -48,7 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip);
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
struct xqmstats xqmstats;
-STATIC int
-xfs_qm_read_xfsquota(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xqm_proc_show(struct seq_file *m, void *v)
{
- int len;
-
/* maximum; incore; ratio free to inuse; freelist */
- len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+ seq_printf(m, "%d\t%d\t%d\t%u\n",
ndquot,
xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
-
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+ return 0;
}
-STATIC int
-xfs_qm_read_stats(
- char *buffer,
- char **start,
- off_t offset,
- int count,
- int *eof,
- void *data)
+static int xqm_proc_open(struct inode *inode, struct file *file)
{
- int len;
+ return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqm_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
/* quota performance statistics */
- len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
+ seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
xqmstats.xs_qm_dqreclaims,
xqmstats.xs_qm_dqreclaim_misses,
xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
xqmstats.xs_qm_dqwants,
xqmstats.xs_qm_dqshake_reclaims,
xqmstats.xs_qm_dqinact_reclaims);
+ return 0;
+}
- if (offset >= len) {
- *start = buffer;
- *eof = 1;
- return 0;
- }
- *start = buffer + offset;
- if ((len -= offset) > count)
- return count;
- *eof = 1;
-
- return len;
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xqmstat_proc_show, NULL);
}
+static const struct file_operations xqmstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = xqmstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
void
xfs_qm_init_procfs(void)
{
- create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
- create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
+ proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+ proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
}
void
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
xfs_agino_t pagi_count; /* number of allocated inodes */
int pagb_count; /* pagb slots in use */
xfs_perag_busy_t *pagb_list; /* unstable blocks */
+
+ /*
+ * Inode allocation search lookup optimisation.
+ * If the pagino matches, the search for new inodes
+ * doesn't need to search the near ones again straight away
+ */
+ xfs_agino_t pagl_pagino;
+ xfs_agino_t pagl_leftrec;
+ xfs_agino_t pagl_rightrec;
#ifdef __KERNEL__
spinlock_t pagb_lock; /* lock for pagb_list */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index db15feb906ff..4ece1906bd41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
- blkcnt, XFS_BUF_LOCK, &bp);
+ blkcnt,
+ XFS_BUF_LOCK | XBF_DONT_BLOCK,
+ &bp);
if (error)
return(error);
@@ -2141,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
- blkcnt, XFS_BUF_LOCK);
+ bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
+ XFS_BUF_LOCK | XBF_DONT_BLOCK);
ASSERT(bp);
ASSERT(!XFS_BUF_GETERROR(bp));
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7928b9983c1d..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
* entry (null if none). Else, *lastxp will be set to the index
* of the found entry; *gotp will contain the entry.
*/
-xfs_bmbt_rec_host_t * /* pointer to found extent entry */
+STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
xfs_bmap_search_multi_extents(
xfs_ifork_t *ifp, /* inode fork pointer */
xfs_fileoff_t bno, /* block number searched for */
@@ -6009,7 +6009,7 @@ xfs_getbmap(
*/
error = ENOMEM;
subnex = 16;
- map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+ map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
if (!map)
goto out_unlock_ilock;
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
int whichfork,
int *count);
-/*
- * Search the extent records for the entry containing block bno.
- * If bno lies in a hole, point to the next entry. If bno lies
- * past eof, *eofp will be set, and *prevp will contain the last
- * entry (null if none). Else, *lastxp will be set to the index
- * of the found entry; *gotp will contain the entry.
- */
-xfs_bmbt_rec_host_t *
-xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
- xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
-
#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
ext_flag);
}
-/* Endian flipping versions of the bmbt extraction functions */
-void
-xfs_bmbt_disk_get_all(
- xfs_bmbt_rec_t *r,
- xfs_bmbt_irec_t *s)
-{
- __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
- get_unaligned_be64(&r->l1), s);
-}
-
/*
* Extract the blockcount field from an on disk bmap extent record.
*/
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
*l1 = 0;
}
+/* Endian flipping versions of the bmbt extraction functions */
+STATIC void
+xfs_bmbt_disk_get_all(
+ xfs_bmbt_rec_t *r,
+ xfs_bmbt_irec_t *s)
+{
+ __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+ get_unaligned_be64(&r->l1), s);
+}
+
STATIC void
xfs_bmbt_trace_record(
struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
-extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e9df99574829..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -120,8 +120,8 @@ xfs_btree_check_sblock(
XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
if (bp)
xfs_buftrace("SBTREE ERROR", bp);
- XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
- cur->bc_mp);
+ XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
+ XFS_ERRLEVEL_LOW, cur->bc_mp, block);
return XFS_ERROR(EFSCORRUPTED);
}
return 0;
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
}
/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- xfs_buf_t **bpp, /* buffer for agno/agbno */
- int refval) /* ref count value for buffer */
-{
- xfs_buf_t *bp; /* return value */
- xfs_daddr_t d; /* real disk block address */
- int error;
-
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp))) {
- return error;
- }
- ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (bp != NULL) {
- switch (refval) {
- case XFS_ALLOC_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
- break;
- case XFS_INO_BTREE_REF:
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
- break;
- }
- }
- *bpp = bp;
- return 0;
-}
-
-/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Long-form addressing.
*/
@@ -2951,7 +2911,7 @@ error0:
* inode we have to copy the single block it was pointing to into the
* inode.
*/
-int
+STATIC int
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
int refval);/* ref count value for buffer */
/*
- * Get a buffer for the block, return it read in.
- * Short-form addressing.
- */
-int /* error */
-xfs_btree_read_bufs(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock, /* lock flags for read_buf */
- struct xfs_buf **bpp, /* buffer for agno/agbno */
- int refval);/* ref count value for buffer */
-
-/*
* Read-ahead the block, don't wait for it, don't return a buffer.
* Long-form addressing.
*/
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
-int xfs_btree_kill_iroot(struct xfs_btree_cur *);
int xfs_btree_insert(struct xfs_btree_cur *, int *);
int xfs_btree_delete(struct xfs_btree_cur *, int *);
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9ff6e57a5075..2847bbc1c534 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
xfs_da_state_t *
xfs_da_state_alloc(void)
{
- return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+ return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
}
/*
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
int off;
if (nbuf == 1)
- dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+ dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
else
- dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+ dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
dabuf->dirty = 0;
#ifdef XFS_DABUF_DEBUG
dabuf->ra = ra;
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c657bec6d951..bb1d58eb3982 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result(
!(args->op_flags & XFS_DA_OP_CILOOKUP))
return EEXIST;
- args->value = kmem_alloc(len, KM_MAYFAIL);
+ args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
if (!args->value)
return ENOMEM;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c4ea51b55dce..f52ac276277e 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -117,7 +117,7 @@ struct getbmapx {
#define BMV_IF_VALID \
(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
-/* bmv_oflags values - returned for for each non-header segment */
+/* bmv_oflags values - returned for each non-header segment */
#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
#define BMV_OF_LAST 0x4 /* segment is the last in the file */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cbd451bb4848..2d0b3e1da9e6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,17 +167,25 @@ xfs_growfs_data_private(
new = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
if (nagcount > oagcount) {
+ void *new_perag, *old_perag;
+
xfs_filestream_flush(mp);
+
+ new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
+ KM_MAYFAIL);
+ if (!new_perag)
+ return XFS_ERROR(ENOMEM);
+
down_write(&mp->m_peraglock);
- mp->m_perag = kmem_realloc(mp->m_perag,
- sizeof(xfs_perag_t) * nagcount,
- sizeof(xfs_perag_t) * oagcount,
- KM_SLEEP);
- memset(&mp->m_perag[oagcount], 0,
- (nagcount - oagcount) * sizeof(xfs_perag_t));
+ memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
+ old_perag = mp->m_perag;
+ mp->m_perag = new_perag;
+
mp->m_flags |= XFS_MOUNT_32BITINODES;
nagimax = xfs_initialize_perag(mp, nagcount);
up_write(&mp->m_peraglock);
+
+ kmem_free(old_perag);
}
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
tp->t_flags |= XFS_TRANS_RESERVE;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
}
/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-STATIC int /* error */
-xfs_inobt_lookup_eq(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
*/
int /* error */
-xfs_inobt_lookup_ge(
+xfs_inobt_lookup(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
+ xfs_lookup_t dir, /* <=, >=, == */
int *stat) /* success/failure */
{
cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
}
/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_le(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-}
-
-/*
- * Update the record referred to by cur to the value given
- * by [ino, fcnt, free].
+ * Update the record referred to by cur to the value given.
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
STATIC int /* error */
xfs_inobt_update(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free) /* free inode mask */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
{
union xfs_btree_rec rec;
- rec.inobt.ir_startino = cpu_to_be32(ino);
- rec.inobt.ir_freecount = cpu_to_be32(fcnt);
- rec.inobt.ir_free = cpu_to_be64(free);
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
return xfs_btree_update(cur, &rec);
}
@@ -135,9 +95,7 @@ xfs_inobt_update(
int /* error */
xfs_inobt_get_rec(
struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t *ino, /* output: starting inode of chunk */
- __int32_t *fcnt, /* output: number of free inodes */
- xfs_inofree_t *free, /* output: free inode mask */
+ xfs_inobt_rec_incore_t *irec, /* btree record */
int *stat) /* output: success/failure */
{
union xfs_btree_rec *rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(
error = xfs_btree_get_rec(cur, &rec, stat);
if (!error && *stat == 1) {
- *ino = be32_to_cpu(rec->inobt.ir_startino);
- *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
- *free = be64_to_cpu(rec->inobt.ir_free);
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
return error;
}
/*
+ * Verify that the number of free inodes in the AGI is correct.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur,
+ struct xfs_agi *agi)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+ ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi) 0
+#endif
+
+/*
+ * Initialise a new set of inodes.
+ */
+STATIC void
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int blks_per_cluster, nbufs, ninodes;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+
+ /*
+ * Loop over the new block(s), filling in the inodes.
+ * For small block sizes, manipulate the inodes in buffers
+ * which are multiples of the blocks size.
+ */
+ if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+ blks_per_cluster = 1;
+ nbufs = length;
+ ninodes = mp->m_sb.sb_inopblock;
+ } else {
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+ mp->m_sb.sb_blocksize;
+ nbufs = length / blks_per_cluster;
+ ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+ }
+
+ /*
+ * Figure out what version number to use in the inodes we create.
+ * If the superblock version has caught up to the one that supports
+ * the new inode format, then use the new inode version. Otherwise
+ * use the old version so that old kernels will continue to be
+ * able to use the file system.
+ */
+ if (xfs_sb_version_hasnlink(&mp->m_sb))
+ version = 2;
+ else
+ version = 1;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * blks_per_cluster,
+ XFS_BUF_LOCK);
+ ASSERT(fbuf);
+ ASSERT(!XFS_BUF_GETERROR(fbuf));
+
+ /*
+ * Initialize all inodes in this buffer and then log them.
+ *
+ * XXX: It would be much better if we had just one transaction
+ * to log a whole cluster of inodes instead of all the
+ * individual transactions causing a lot of log traffic.
+ */
+ xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
+ for (i = 0; i < ninodes; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+ uint isize = sizeof(struct xfs_dinode);
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
+ }
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ }
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- int blks_per_cluster; /* fs blocks per inode cluster */
xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_daddr_t d; /* disk addr of buffer */
xfs_agnumber_t agno;
int error;
- xfs_buf_t *fbuf; /* new free inodes' buffer */
- xfs_dinode_t *free; /* new free inode structure */
- int i; /* inode counter */
- int j; /* block counter */
- int nbufs; /* num bufs of new inodes */
+ int i;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- int ninodes; /* num inodes per buf */
xfs_agino_t thisino; /* current inode number, for loop */
- int version; /* inode version number to use */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
- unsigned int gen;
args.tp = tp;
args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
*/
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
+ agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
XFS_IALLOC_BLOCKS(args.mp);
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
args.mod = args.total = args.wasdel = args.isfl =
args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
* For now, just allocate blocks up front.
*/
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
/*
* Allocate a fixed-size extent of inodes.
*/
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.alignment = xfs_ialloc_cluster_alignment(&args);
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
return 0;
}
ASSERT(args.len == args.minlen);
- /*
- * Convert the results.
- */
- newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
- /*
- * Loop over the new block(s), filling in the inodes.
- * For small block sizes, manipulate the inodes in buffers
- * which are multiples of the blocks size.
- */
- if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
- blks_per_cluster = 1;
- nbufs = (int)args.len;
- ninodes = args.mp->m_sb.sb_inopblock;
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
- args.mp->m_sb.sb_blocksize;
- nbufs = (int)args.len / blks_per_cluster;
- ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
- }
- /*
- * Figure out what version number to use in the inodes we create.
- * If the superblock version has caught up to the one that supports
- * the new inode format, then use the new inode version. Otherwise
- * use the old version so that old kernels will continue to be
- * able to use the file system.
- */
- if (xfs_sb_version_hasnlink(&args.mp->m_sb))
- version = 2;
- else
- version = 1;
/*
+ * Stamp and write the inode buffers.
+ *
* Seed the new inode cluster with a random generation number. This
* prevents short-term reuse of generation numbers if a chunk is
* freed and then immediately reallocated. We use random numbers
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- gen = random32();
- for (j = 0; j < nbufs; j++) {
- /*
- * Get the block.
- */
- d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
- args.agbno + (j * blks_per_cluster));
- fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
- args.mp->m_bsize * blks_per_cluster,
- XFS_BUF_LOCK);
- ASSERT(fbuf);
- ASSERT(!XFS_BUF_GETERROR(fbuf));
+ xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
+ random32());
- /*
- * Initialize all inodes in this buffer and then log them.
- *
- * XXX: It would be much better if we had just one transaction to
- * log a whole cluster of inodes instead of all the individual
- * transactions causing a lot of log traffic.
- */
- xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
- for (i = 0; i < ninodes; i++) {
- int ioffset = i << args.mp->m_sb.sb_inodelog;
- uint isize = sizeof(struct xfs_dinode);
-
- free = xfs_make_iptr(args.mp, fbuf, i);
- free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- free->di_version = version;
- free->di_gen = cpu_to_be32(gen);
- free->di_next_unlinked = cpu_to_be32(NULLAGINO);
- xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
- }
- xfs_trans_inode_alloc_buf(tp, fbuf);
- }
+ /*
+ * Convert the results.
+ */
+ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
- agno = be32_to_cpu(agi->agi_seqno);
down_read(&args.mp->m_peraglock);
args.mp->m_perag[agno].pagi_freecount += newlen;
up_read(&args.mp->m_peraglock);
agi->agi_newino = cpu_to_be32(newino);
+
/*
* Insert records describing the new inode chunk into the btree.
*/
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
for (thisino = newino;
thisino < newino + newlen;
thisino += XFS_INODES_PER_CHUNK) {
- if ((error = xfs_inobt_lookup_eq(cur, thisino,
- XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
+ cur->bc_rec.i.ir_startino = thisino;
+ cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
+ cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
+ if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
ASSERT(i == 0);
- if ((error = xfs_btree_insert(cur, &i))) {
+ error = xfs_btree_insert(cur, &i);
+ if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
@@ -539,6 +557,62 @@ nextag:
}
/*
+ * Try to retrieve the next record to the left/right from the current one.
+ */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
+
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ }
+
+ return 0;
+}
+
+/*
* Visible inode allocation functions.
*/
@@ -592,8 +666,8 @@ xfs_dialloc(
int j; /* result code */
xfs_mount_t *mp; /* file system mount structure */
int offset; /* index of inode in chunk */
- xfs_agino_t pagino; /* parent's a.g. relative inode # */
- xfs_agnumber_t pagno; /* parent's allocation group number */
+ xfs_agino_t pagino; /* parent's AG relative inode # */
+ xfs_agnumber_t pagno; /* parent's AG number */
xfs_inobt_rec_incore_t rec; /* inode allocation record */
xfs_agnumber_t tagno; /* testing allocation group number */
xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
*/
agno = tagno;
*IO_agbp = NULL;
+
+ restart_pagno:
cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
/*
* If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,199 @@ nextag:
*/
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
/*
- * If in the same a.g. as the parent, try to get near the parent.
+ * If in the same AG as the parent, try to get near the parent.
*/
if (pagno == agno) {
- if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+ xfs_perag_t *pag = &mp->m_perag[agno];
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+ int searchdistance = 10;
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
goto error0;
- if (i != 0 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (rec.ir_freecount > 0) {
/*
* Found a free inode in the same chunk
- * as parent, done.
+ * as the parent, done.
*/
+ goto alloc_inode;
}
+
+
+ /*
+ * In the same AG as parent, but parent's chunk is full.
+ */
+
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
/*
- * In the same a.g. as parent, but parent's chunk is full.
+ * Skip to last blocks looked up if same parent inode.
*/
- else {
- int doneleft; /* done, to the left */
- int doneright; /* done, to the right */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft, 1);
+ if (error)
+ goto error1;
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright, 0);
if (error)
- goto error0;
- ASSERT(i == 1);
- ASSERT(j == 1);
- /*
- * Duplicate the cursor, search left & right
- * simultaneously.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- goto error0;
- /*
- * Search left with tcur, back up 1 record.
- */
- if ((error = xfs_btree_decrement(tcur, 0, &i)))
goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Search right with cur, go forward 1 record.
- */
- if ((error = xfs_btree_increment(cur, 0, &i)))
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Loop until we find the closest inode chunk
- * with a free one.
- */
- while (!doneleft || !doneright) {
- int useleft; /* using left inode
- chunk this time */
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ */
+ while (!doneleft || !doneright) {
+ int useleft; /* using left inode chunk this time */
+
+ if (!--searchdistance) {
/*
- * Figure out which block is closer,
- * if both are valid.
- */
- if (!doneleft && !doneright)
- useleft =
- pagino -
- (trec.ir_startino +
- XFS_INODES_PER_CHUNK - 1) <
- rec.ir_startino - pagino;
- else
- useleft = !doneleft;
- /*
- * If checking the left, does it have
- * free inodes?
- */
- if (useleft && trec.ir_freecount) {
- /*
- * Yes, set it up as the chunk to use.
- */
- rec = trec;
- xfs_btree_del_cursor(cur,
- XFS_BTREE_NOERROR);
- cur = tcur;
- break;
- }
- /*
- * If checking the right, does it have
- * free inodes?
- */
- if (!useleft && rec.ir_freecount) {
- /*
- * Yes, it's already set up.
- */
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- break;
- }
- /*
- * If used the left, get another one
- * further left.
- */
- if (useleft) {
- if ((error = xfs_btree_decrement(tcur, 0,
- &i)))
- goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(
- tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
- /*
- * If used the right, get another one
- * further right.
+ * Not in range - save last search
+ * location and allocate a new inode
*/
- else {
- if ((error = xfs_btree_increment(cur, 0,
- &i)))
- goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(
- cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto newino;
+ }
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
}
- ASSERT(!doneleft || !doneright);
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ rec = trec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
+ }
+ if (error)
+ goto error1;
}
+
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
}
+
/*
- * In a different a.g. from the parent.
+ * In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
- else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
- if ((error = xfs_inobt_lookup_eq(cur,
- be32_to_cpu(agi->agi_newino), 0, 0, &i)))
+newino:
+ if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
goto error0;
- if (i == 1 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
- /*
- * The last chunk allocated in the group still has
- * a free inode.
- */
- }
- /*
- * None left in the last group, search the whole a.g.
- */
- else {
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- ASSERT(i == 1);
- for (;;) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free,
- &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (rec.ir_freecount > 0)
- break;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
}
}
}
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+
+alloc_inode:
offset = xfs_ialloc_find_free(&rec.ir_free);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
@@ -945,33 +1000,19 @@ nextag:
ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
- rec.ir_free)))
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
goto error0;
be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
down_read(&mp->m_peraglock);
mp->m_perag[tagno].pagi_freecount--;
up_read(&mp->m_peraglock);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
*inop = ino;
@@ -1062,38 +1103,23 @@ xfs_difree(
* Initialize the cursor.
*/
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
+
/*
* Look for the entry describing this inode.
*/
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
+ "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.",
error, mp->m_fsname);
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
- &rec.ir_free, &i))) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
cmn_err(CE_WARN,
"xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
error, mp->m_fsname);
@@ -1148,12 +1174,14 @@ xfs_difree(
} else {
*delete = 0;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
+ "xfs_difree: xfs_inobt_update returned an error %d on %s.",
error, mp->m_fsname);
goto error0;
}
+
/*
* Change the inode free counts and log the ag/sb changes.
*/
@@ -1165,28 +1193,10 @@ xfs_difree(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_btree_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1297,9 +1307,7 @@ xfs_imap(
chunk_agbno = agbno - offset_agbno;
} else {
xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_agino_t chunk_agino; /* first agino in inode chunk */
- __int32_t chunk_cnt; /* count of free inodes in chunk */
- xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
+ xfs_inobt_rec_incore_t chunk_rec;
xfs_buf_t *agbp; /* agi buffer */
int i; /* temp state */
@@ -1315,15 +1323,14 @@ xfs_imap(
}
cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
- error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (error) {
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "xfs_inobt_lookup_le() failed");
+ "xfs_inobt_lookup() failed");
goto error0;
}
- error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
- &chunk_free, &i);
+ error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
if (error) {
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_inobt_get_rec() failed");
@@ -1341,7 +1348,7 @@ xfs_imap(
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
if (error)
return error;
- chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
+ chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
offset_agbno = agbno - chunk_agbno;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
xfs_agnumber_t agno); /* allocation group number */
/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
+ * Lookup a record by ino in the btree given by cur.
*/
-int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
+int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ xfs_lookup_t dir, int *stat);
/*
* Get the data from the pointed-to record.
*/
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
- __int32_t *fcnt, xfs_inofree_t *free, int *stat);
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec, int *stat);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 34ec86923f7e..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
ip->i_update_core = 0;
- ip->i_update_size = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
ip->i_size = 0;
@@ -191,80 +190,82 @@ xfs_iget_cache_hit(
int flags,
int lock_flags) __releases(pag->pag_ici_lock)
{
+ struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount;
- int error = EAGAIN;
+ int error;
+
+ spin_lock(&ip->i_flags_lock);
/*
- * If INEW is set this inode is being set up
- * If IRECLAIM is set this inode is being torn down
- * Pause and try again.
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimabe state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
*/
- if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
goto out_error;
}
- /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
- if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
-
- /*
- * If lookup is racing with unlink, then we should return an
- * error immediately so we don't remove it from the reclaim
- * list and potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * We need to re-initialise the VFS inode as it has been
- * 'freed' by the VFS. Do this here so we can deal with
- * errors cleanly, then tag it so it can be set up correctly
- * later.
+ * We need to set XFS_INEW atomically with clearing the
+ * reclaimable tag so that we do have an indicator of the
+ * inode still being initialized.
*/
- if (inode_init_always(mp->m_super, VFS_I(ip))) {
- error = ENOMEM;
- goto out_error;
- }
+ ip->i_flags |= XFS_INEW;
+ ip->i_flags &= ~XFS_IRECLAIMABLE;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- /*
- * We must set the XFS_INEW flag before clearing the
- * XFS_IRECLAIMABLE flag so that if a racing lookup does
- * not find the XFS_IRECLAIMABLE above but has the igrab()
- * below succeed we can safely check XFS_INEW to detect
- * that this inode is still being initialised.
- */
- xfs_iflags_set(ip, XFS_INEW);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
- /* clear the radix tree reclaim flag as well. */
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- } else if (!igrab(VFS_I(ip))) {
+ error = -inode_init_always(mp->m_super, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ read_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~XFS_INEW;
+ ip->i_flags |= XFS_IRECLAIMABLE;
+ __xfs_inode_set_reclaim_tag(pag, ip);
+ goto out_error;
+ }
+ inode->i_state = I_LOCK|I_NEW;
+ } else {
/* If the VFS inode is being torn down, pause and try again. */
- XFS_STATS_INC(xs_ig_frecycle);
- goto out_error;
- } else if (xfs_iflags_test(ip, XFS_INEW)) {
- /*
- * We are racing with another cache hit that is
- * currently recycling this inode out of the XFS_IRECLAIMABLE
- * state. Wait for the initialisation to complete before
- * continuing.
- */
- wait_on_inode(VFS_I(ip));
- }
+ if (!igrab(inode)) {
+ error = EAGAIN;
+ goto out_error;
+ }
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- iput(VFS_I(ip));
- goto out_error;
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ read_unlock(&pag->pag_ici_lock);
}
- /* We've got a live one. */
- read_unlock(&pag->pag_ici_lock);
-
if (lock_flags != 0)
xfs_ilock(ip, lock_flags);
@@ -274,6 +275,7 @@ xfs_iget_cache_hit(
return 0;
out_error:
+ spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock);
return error;
}
@@ -453,32 +455,6 @@ out_error_or_again:
return error;
}
-
-/*
- * Look for the inode corresponding to the given ino in the hash table.
- * If it is there and its i_transp pointer matches tp, return it.
- * Otherwise, return NULL.
- */
-xfs_inode_t *
-xfs_inode_incore(xfs_mount_t *mp,
- xfs_ino_t ino,
- xfs_trans_t *tp)
-{
- xfs_inode_t *ip;
- xfs_perag_t *pag;
-
- pag = xfs_get_perag(mp, ino);
- read_lock(&pag->pag_ici_lock);
- ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
-
- /* the returned inode must match the transaction */
- if (ip && (ip->i_transp != tp))
- return NULL;
- return ip;
-}
-
/*
* Decrement reference count of an inode structure and unlock it.
*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1f22d65fed0a..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -343,6 +343,16 @@ xfs_iformat(
return XFS_ERROR(EFSCORRUPTED);
}
+ if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+ !ip->i_mount->m_rtdev_targp)) {
+ xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+ "corrupt dinode %Lu, has realtime flag set.",
+ ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+ XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
switch (ip->i_d.di_mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
@@ -641,7 +651,7 @@ xfs_iformat_btree(
return 0;
}
-void
+STATIC void
xfs_dinode_from_disk(
xfs_icdinode_t *to,
xfs_dinode_t *from)
@@ -1237,7 +1247,7 @@ xfs_isize_check(
* In that case the pages will still be in memory, but the inode size
* will never have been updated.
*/
-xfs_fsize_t
+STATIC xfs_fsize_t
xfs_file_last_byte(
xfs_inode_t *ip)
{
@@ -3827,7 +3837,7 @@ xfs_iext_inline_to_direct(
/*
* Resize an extent indirection array to new_size bytes.
*/
-void
+STATIC void
xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
@@ -3852,7 +3862,7 @@ xfs_iext_realloc_indirect(
/*
* Switch from indirection array to linear (direct) extent allocations.
*/
-void
+STATIC void
xfs_iext_indirect_to_direct(
xfs_ifork_t *ifp) /* inode fork pointer */
{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
/* Miscellaneous state. */
unsigned short i_flags; /* see defined flags below */
unsigned char i_update_core; /* timestamps/size is dirty */
- unsigned char i_update_size; /* di_size field is dirty */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -468,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
/*
* xfs_iget.c prototypes.
*/
-xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
- struct xfs_trans *);
int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
uint, uint, xfs_inode_t **, xfs_daddr_t);
void xfs_iput(xfs_inode_t *, uint);
@@ -504,7 +501,6 @@ void xfs_ipin(xfs_inode_t *);
void xfs_iunpin(xfs_inode_t *);
int xfs_iflush(xfs_inode_t *, uint);
void xfs_ichgtime(xfs_inode_t *, int);
-xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
@@ -572,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
struct xfs_buf **, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, xfs_daddr_t, uint);
-void xfs_dinode_from_disk(struct xfs_icdinode *,
- struct xfs_dinode *);
void xfs_dinode_to_disk(struct xfs_dinode *,
struct xfs_icdinode *);
void xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
void xfs_iext_realloc_direct(xfs_ifork_t *, int);
-void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
-void xfs_iext_indirect_to_direct(xfs_ifork_t *);
void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
void xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
}
/*
- * We don't have to worry about re-ordering here because
- * the update_size field is protected by the inode lock
- * and we have that held in exclusive mode.
- */
- if (ip->i_update_size)
- ip->i_update_size = 0;
-
- /*
* Make sure to get the latest atime from the Linux inode.
*/
xfs_synchronize_atime(ip);
@@ -712,8 +704,6 @@ xfs_inode_item_unlock(
* Clear out the fields of the inode log item particular
* to the current transaction.
*/
- iip->ili_ilock_recur = 0;
- iip->ili_iolock_recur = 0;
iip->ili_flags = 0;
/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
struct xfs_inode *ili_inode; /* inode ptr */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
- unsigned short ili_ilock_recur; /* lock recursion count */
- unsigned short ili_iolock_recur; /* lock recursion count */
unsigned short ili_flags; /* misc flags */
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
#if XFS_BIG_INUMS
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
-#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
#else
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
#endif
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
#include "xfs_error.h"
#include "xfs_btree.h"
-int
+STATIC int
xfs_internal_inum(
xfs_mount_t *mp,
xfs_ino_t ino)
@@ -353,9 +353,6 @@ xfs_bulkstat(
int end_of_ag; /* set if we've seen the ag end */
int error; /* error code */
int fmterror;/* bulkstat formatter result */
- __int32_t gcnt; /* current btree rec's count */
- xfs_inofree_t gfree; /* current btree rec's free mask */
- xfs_agino_t gino; /* current btree rec's start inode */
int i; /* loop index */
int icount; /* count of inodes good in irbuf */
size_t irbsize; /* size of irec buffer in bytes */
@@ -442,40 +439,43 @@ xfs_bulkstat(
* we need to get the remainder of the chunk we're in.
*/
if (agino > 0) {
+ xfs_inobt_rec_incore_t r;
+
/*
* Lookup the inode chunk that this inode lives in.
*/
- error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
+ &tmp);
if (!error && /* no I/O error */
tmp && /* lookup succeeded */
/* got the record, should always work */
- !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
- &gfree, &i)) &&
+ !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
i == 1 &&
/* this is the right chunk */
- agino < gino + XFS_INODES_PER_CHUNK &&
+ agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
/* lastino was not last in chunk */
- (chunkidx = agino - gino + 1) <
+ (chunkidx = agino - r.ir_startino + 1) <
XFS_INODES_PER_CHUNK &&
/* there are some left allocated */
xfs_inobt_maskn(chunkidx,
- XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+ XFS_INODES_PER_CHUNK - chunkidx) &
+ ~r.ir_free) {
/*
* Grab the chunk record. Mark all the
* uninteresting inodes (because they're
* before our start point) free.
*/
for (i = 0; i < chunkidx; i++) {
- if (XFS_INOBT_MASK(i) & ~gfree)
- gcnt++;
+ if (XFS_INOBT_MASK(i) & ~r.ir_free)
+ r.ir_freecount++;
}
- gfree |= xfs_inobt_maskn(0, chunkidx);
- irbp->ir_startino = gino;
- irbp->ir_freecount = gcnt;
- irbp->ir_free = gfree;
+ r.ir_free |= xfs_inobt_maskn(0, chunkidx);
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
irbp++;
- agino = gino + XFS_INODES_PER_CHUNK;
- icount = XFS_INODES_PER_CHUNK - gcnt;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
+ icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
} else {
/*
* If any of those tests failed, bump the
@@ -493,7 +493,7 @@ xfs_bulkstat(
/*
* Start of ag. Lookup the first inode chunk.
*/
- error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
icount = 0;
}
/*
@@ -501,6 +501,8 @@ xfs_bulkstat(
* until we run out of inodes or space in the buffer.
*/
while (irbp < irbufend && icount < ubcount) {
+ xfs_inobt_rec_incore_t r;
+
/*
* Loop as long as we're unable to read the
* inode btree.
@@ -510,51 +512,55 @@ xfs_bulkstat(
if (XFS_AGINO_TO_AGBNO(mp, agino) >=
be32_to_cpu(agi->agi_length))
break;
- error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
- &tmp);
+ error = xfs_inobt_lookup(cur, agino,
+ XFS_LOOKUP_GE, &tmp);
cond_resched();
}
/*
* If ran off the end of the ag either with an error,
* or the normal way, set end and stop collecting.
*/
- if (error ||
- (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
- &gfree, &i)) ||
- i == 0) {
+ if (error) {
end_of_ag = 1;
break;
}
+
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
+ end_of_ag = 1;
+ break;
+ }
+
/*
* If this chunk has any allocated inodes, save it.
* Also start read-ahead now for this chunk.
*/
- if (gcnt < XFS_INODES_PER_CHUNK) {
+ if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
/*
* Loop over all clusters in the next chunk.
* Do a readahead if there are any allocated
* inodes in that cluster.
*/
- for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
- chunkidx = 0;
+ agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+ for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
chunkidx += nicluster,
agbno += nbcluster) {
- if (xfs_inobt_maskn(chunkidx,
- nicluster) & ~gfree)
+ if (xfs_inobt_maskn(chunkidx, nicluster)
+ & ~r.ir_free)
xfs_btree_reada_bufs(mp, agno,
agbno, nbcluster);
}
- irbp->ir_startino = gino;
- irbp->ir_freecount = gcnt;
- irbp->ir_free = gfree;
+ irbp->ir_startino = r.ir_startino;
+ irbp->ir_freecount = r.ir_freecount;
+ irbp->ir_free = r.ir_free;
irbp++;
- icount += XFS_INODES_PER_CHUNK - gcnt;
+ icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
}
/*
* Set agino to after this chunk and bump the cursor.
*/
- agino = gino + XFS_INODES_PER_CHUNK;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK;
error = xfs_btree_increment(cur, 0, &tmp);
cond_resched();
}
@@ -820,9 +826,7 @@ xfs_inumbers(
int bufidx;
xfs_btree_cur_t *cur;
int error;
- __int32_t gcnt;
- xfs_inofree_t gfree;
- xfs_agino_t gino;
+ xfs_inobt_rec_incore_t r;
int i;
xfs_ino_t ino;
int left;
@@ -855,7 +859,8 @@ xfs_inumbers(
continue;
}
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
- error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+ &tmp);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
cur = NULL;
@@ -870,9 +875,8 @@ xfs_inumbers(
continue;
}
}
- if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
- &i)) ||
- i == 0) {
+ error = xfs_inobt_get_rec(cur, &r, &i);
+ if (error || i == 0) {
xfs_buf_relse(agbp);
agbp = NULL;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +885,12 @@ xfs_inumbers(
agino = 0;
continue;
}
- agino = gino + XFS_INODES_PER_CHUNK - 1;
- buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
- buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
- buffer[bufidx].xi_allocmask = ~gfree;
+ agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+ buffer[bufidx].xi_startino =
+ XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+ buffer[bufidx].xi_alloccount =
+ XFS_INODES_PER_CHUNK - r.ir_freecount;
+ buffer[bufidx].xi_allocmask = ~r.ir_free;
bufidx++;
left--;
if (bufidx == bcount) {
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
void *dibuff,
int *stat);
-int
-xfs_internal_inum(
- xfs_mount_t *mp,
- xfs_ino_t ino);
-
typedef int (*inumbers_fmt_pf)(
void __user *ubuffer, /* buffer to write to */
const xfs_inogrp_t *buffer, /* buffer to read from */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3750f04ede0b..9dbdff3ea484 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3180,7 +3180,7 @@ try_again:
STATIC void
xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
{
- ASSERT(spin_is_locked(&log->l_icloglock));
+ assert_spin_locked(&log->l_icloglock);
if (iclog->ic_state == XLOG_STATE_ACTIVE) {
xlog_state_switch_iclogs(log, iclog, 0);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
extern int xlog_recover(xlog_t *log);
extern int xlog_recover_finish(xlog_t *log);
extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void xlog_recover_process_iunlinks(xlog_t *log);
-
extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
extern void xlog_put_bp(struct xfs_buf *);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
* freeing of the inode and its removal from the list must be
* atomic.
*/
-void
+STATIC void
xlog_recover_process_iunlinks(
xlog_t *log)
{
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
*
* The m_sb_lock must be held when this routine is called.
*/
-int
+STATIC int
xfs_mod_incore_sb_unlocked(
xfs_mount_t *mp,
xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
extern int xfs_log_sbcount(xfs_mount_t *, uint);
extern int xfs_mountfs(xfs_mount_t *mp);
-extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_unmountfs_writesb(xfs_mount_t *);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
- int64_t, int);
extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
uint, int);
extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
}
/*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
- xfs_mru_cache_t *mru,
- unsigned long key)
-{
- xfs_mru_cache_elem_t *elem;
-
- ASSERT(mru && mru->lists);
- if (!mru || !mru->lists)
- return NULL;
-
- spin_lock(&mru->lock);
- elem = radix_tree_lookup(&mru->store, key);
- if (!elem)
- spin_unlock(&mru->lock);
- else
- __release(mru_lock); /* help sparse not be stupid */
-
- return elem ? elem->value : NULL;
-}
-
-/*
* To release the internal data structure spinlock after having performed an
* xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
* with the data store pointer.
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_done(struct xfs_mru_cache *mru);
#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
}
/*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
- xfs_mount_t *mp,
- xfs_inode_t *ip)
-{
- int error = 0;
-
- /*
- * If we're treating this as O_DSYNC and we have not updated the
- * size, force the log.
- */
- if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
- !(ip->i_update_size)) {
- xfs_inode_log_item_t *iip = ip->i_itemp;
-
- /*
- * If an allocation transaction occurred
- * without extending the size, then we have to force
- * the log up the proper point to ensure that the
- * allocation is permanent. We can't count on
- * the fact that buffered writes lock out direct I/O
- * writes - the direct I/O write could have extended
- * the size nontransactionally, then finished before
- * we started. xfs_write_file will think that the file
- * didn't grow but the update isn't safe unless the
- * size change is logged.
- *
- * Force the log if we've committed a transaction
- * against the inode or if someone else has and
- * the commit record hasn't gone to disk (e.g.
- * the inode is pinned). This guarantees that
- * all changes affecting the inode are permanent
- * when we return.
- */
- if (iip && iip->ili_last_lsn) {
- error = _xfs_log_force(mp, iip->ili_last_lsn,
- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
- } else if (xfs_ipincount(ip) > 0) {
- error = _xfs_log_force(mp, (xfs_lsn_t)0,
- XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
- }
-
- } else {
- xfs_trans_t *tp;
-
- /*
- * O_SYNC or O_DSYNC _with_ a size update are handled
- * the same way.
- *
- * If the write was synchronous then we need to make
- * sure that the inode modification time is permanent.
- * We'll have updated the timestamp above, so here
- * we use a synchronous transaction to log the inode.
- * It's not fast, but it's necessary.
- *
- * If this a dsync write and the size got changed
- * non-transactionally, then we need to ensure that
- * the size change gets logged in a synchronous
- * transaction.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_SWRITE_LOG_RES(mp),
- 0, 0, 0))) {
- /* Transaction reserve failed */
- xfs_trans_cancel(tp, 0);
- } else {
- /* Transaction reserve successful */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- }
-
- return error;
-}
-
-/*
* Force a shutdown of the filesystem instantly while keeping
* the filesystem consistent. We don't do an unmount here; just shutdown
* the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
* Prototypes for functions in xfs_rw.c.
*/
extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
extern int xfs_bioerror(struct xfs_buf *bp);
extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
xfs_buf_t *bp, xfs_daddr_t blkno);
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
- int flags);
-
#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_GROWFS 14
#define XFS_TRANS_STRAT_WRITE 15
#define XFS_TRANS_DIOSTRAT 16
-#define XFS_TRANS_WRITE_SYNC 17
+/* 17 was XFS_TRANS_WRITE_SYNC */
#define XFS_TRANS_WRITEID 18
#define XFS_TRANS_ADDAFORK 19
#define XFS_TRANS_ATTRINVAL 20
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
return (flags & XFS_BUF_TRYLOCK) ?
EAGAIN : XFS_ERROR(ENOMEM);
- if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+ if (XFS_BUF_GETERROR(bp) != 0) {
xfs_ioerror_alert("xfs_trans_read_buf", mp,
bp, blkno);
error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
return error;
}
#ifdef DEBUG
- if (xfs_do_error && (bp != NULL)) {
+ if (xfs_do_error) {
if (xfs_error_target == target) {
if (((xfs_req_num++) % xfs_error_mod) == 0) {
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
/*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction. If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively. This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength. That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa. Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
*/
int
xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
xfs_inode_t **ipp)
{
int error;
- xfs_inode_t *ip;
-
- /*
- * If the transaction pointer is NULL, just call the normal
- * xfs_iget().
- */
- if (tp == NULL)
- return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
- /*
- * If we find the inode in core with this transaction
- * pointer in its i_transp field, then we know we already
- * have it locked. In this case we just increment the lock
- * recursion count and return the inode to the caller.
- * Assert that the inode is already locked in the mode requested
- * by the caller. We cannot do lock promotions yet, so
- * die if someone gets this wrong.
- */
- if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
- /*
- * Make sure that the inode lock is held EXCL and
- * that the io lock is never upgraded when the inode
- * is already a part of the transaction.
- */
- ASSERT(ip->i_itemp != NULL);
- ASSERT(lock_flags & XFS_ILOCK_EXCL);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
- (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
- ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
- xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
- ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
- (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
- if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
- ip->i_itemp->ili_iolock_recur++;
- }
- if (lock_flags & XFS_ILOCK_EXCL) {
- ip->i_itemp->ili_ilock_recur++;
- }
- *ipp = ip;
- return 0;
- }
-
- ASSERT(lock_flags & XFS_ILOCK_EXCL);
- error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
- if (error) {
- return error;
- }
- ASSERT(ip != NULL);
- xfs_trans_ijoin(tp, ip, lock_flags);
- *ipp = ip;
- return 0;
+ error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+ if (!error && tp)
+ xfs_trans_ijoin(tp, *ipp, lock_flags);
+ return error;
}
/*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
xfs_inode_item_init(ip, ip->i_mount);
iip = ip->i_itemp;
ASSERT(iip->ili_flags == 0);
- ASSERT(iip->ili_ilock_recur == 0);
- ASSERT(iip->ili_iolock_recur == 0);
/*
* Get a log_item_desc to point at the new item.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c4eca5ed5dab..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -538,7 +538,9 @@ xfs_readlink_bmap(
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+ bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+ XBF_LOCK | XBF_MAPPED |
+ XBF_DONT_BLOCK);
error = XFS_BUF_GETERROR(bp);
if (error) {
xfs_ioerror_alert("xfs_readlink",
@@ -609,7 +611,7 @@ xfs_fsync(
xfs_inode_t *ip)
{
xfs_trans_t *tp;
- int error;
+ int error = 0;
int log_flushed = 0, changed = 1;
xfs_itrace_entry(ip);
@@ -617,14 +619,9 @@ xfs_fsync(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return XFS_ERROR(EIO);
- /* capture size updates in I/O completion before writing the inode. */
- error = xfs_wait_on_pages(ip, 0, -1);
- if (error)
- return XFS_ERROR(error);
-
/*
* We always need to make sure that the required inode state is safe on
- * disk. The vnode might be clean but we still might need to force the
+ * disk. The inode might be clean but we still might need to force the
* log because of committed transactions that haven't hit the disk yet.
* Likewise, there could be unflushed non-transactional changes to the
* inode core that have to go to disk and this requires us to issue
@@ -636,7 +633,7 @@ xfs_fsync(
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!(ip->i_update_size || ip->i_update_core)) {
+ if (!ip->i_update_core) {
/*
* Timestamps/size haven't changed since last inode flush or
* inode transaction commit. That means either nothing got
@@ -716,7 +713,7 @@ xfs_fsync(
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
*/
-int
+STATIC int
xfs_free_eofblocks(
xfs_mount_t *mp,
xfs_inode_t *ip,
@@ -1474,8 +1471,8 @@ xfs_create(
if (error == ENOSPC) {
/* flush outstanding delalloc blocks and retry */
xfs_flush_inodes(dp);
- error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+ error = xfs_trans_reserve(tp, resblks, log_res, 0,
+ XFS_TRANS_PERM_LOG_RES, log_count);
}
if (error == ENOSPC) {
/* No space at all so try a "no-allocation" reservation */